Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
122 files changed, 51894 insertions, 0 deletions
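The first file in the diff, AMDGPU.h, defines the AMDGPUAS address-space numbering the whole backend depends on; its comment stresses that the CONSTANT_BUFFER_* entries must remain contiguous so that a buffer index can simply be added to CONSTANT_BUFFER_0 (ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx). The short standalone sketch below is editorial and not part of this commit: it mirrors two of those enum values as local constants and shows the indexing rule as a hypothetical helper.

#include <cassert>

// Values mirrored from the AMDGPUAS enum in AMDGPU.h (illustrative copy only).
enum : unsigned {
  CONSTANT_BUFFER_0  = 8,
  CONSTANT_BUFFER_15 = 23,
};

// Hypothetical helper: compute the address space of constant buffer CBIdx,
// relying on the contiguity the header's comment requires.
static unsigned constantBufferAddrSpace(unsigned CBIdx) {
  assert(CBIdx <= CONSTANT_BUFFER_15 - CONSTANT_BUFFER_0 &&
         "only 16 constant buffers are defined");
  return CONSTANT_BUFFER_0 + CBIdx;
}

int main() {
  assert(constantBufferAddrSpace(0)  == 8);  // CONSTANT_BUFFER_0
  assert(constantBufferAddrSpace(15) == 23); // CONSTANT_BUFFER_15
  return 0;
}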
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h new file mode 100644 index 0000000..5d00e1c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -0,0 +1,160 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H +#define LLVM_LIB_TARGET_R600_AMDGPU_H + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUInstrPrinter; +class AMDGPUSubtarget; +class AMDGPUTargetMachine; +class FunctionPass; +class MachineSchedContext; +class MCAsmInfo; +class raw_ostream; +class ScheduleDAGInstrs; +class Target; +class TargetMachine; + +// R600 Passes +FunctionPass *createR600VectorRegMerger(TargetMachine &tm); +FunctionPass *createR600TextureIntrinsicsReplacer(); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600EmitClauseMarkers(); +FunctionPass *createR600ClauseMergePass(TargetMachine &tm); +FunctionPass *createR600Packetizer(TargetMachine &tm); +FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); +FunctionPass *createAMDGPUCFGStructurizerPass(); + +// SI Passes +FunctionPass *createSITypeRewriter(); +FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIFoldOperandsPass(); +FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); +FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSIFixControlFlowLiveIntervalsPass(); +FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createSIFixSGPRLiveRangesPass(); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSIInsertWaits(TargetMachine &tm); + +ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); + +ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); +extern char &AMDGPUAnnotateKernelFeaturesID; + +void initializeSIFoldOperandsPass(PassRegistry &); +extern char &SIFoldOperandsID; + +void initializeSIFixSGPRCopiesPass(PassRegistry &); +extern char &SIFixSGPRCopiesID; + +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + +void initializeSILoadStoreOptimizerPass(PassRegistry &); +extern char &SILoadStoreOptimizerID; + +// Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); +Pass *createAMDGPUStructurizeCFGPass(); +FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +ModulePass *createAMDGPUAlwaysInlinePass(); +ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +FunctionPass *createAMDGPUAnnotateUniformValues(); + +void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); +extern char &SIFixControlFlowLiveIntervalsID; + +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + +void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); +extern char &AMDGPUAnnotateUniformValuesPassID; + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +namespace AMDGPU { +enum TargetIndex { + TI_CONSTDATA_START, + TI_SCRATCH_RSRC_DWORD0, + TI_SCRATCH_RSRC_DWORD1, 
+ TI_SCRATCH_RSRC_DWORD2, + TI_SCRATCH_RSRC_DWORD3 +}; +} + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a separate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces : unsigned { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. + PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + + // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this + // order to be able to dynamically index a constant buffer, for example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u +}; + +} // namespace AMDGPUAS + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td new file mode 100644 index 0000000..db869cf --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -0,0 +1,309 @@ +//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Subtarget Features +//===----------------------------------------------------------------------===// + +// Debugging Features + +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", + "EnableIRStructurizer", + "false", + "Disable IR Structurizer">; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass">; + +// Target features + +def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", + "EnableIfCvt", + "false", + "Disable the if conversion pass">; + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations">; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64]>; + +def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add", + []>; + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. +def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "true", + "Specify if 64-bit addressing should be used">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache">; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA">; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug">; + +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass">; + +// Performance debugging feature. Allow using DS instruction immediate +// offsets even if the base pointer can't be proven to be base. On SI, +// base pointer values that won't give the same result as a 16-bit add +// are not safe to fold, but this will override the conservative test +// for the base pointer. 
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding", + "EnableUnsafeDSOffsetFolding", + "true", + "Force using DS instruction immediate offsets on SI">; + +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global">; + +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + +def FeatureXNACK : SubtargetFeature<"xnack", + "EnableXNACK", + "true", + "Enable XNACK support">; + +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory">; + +def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", + "SGPRInitBug", + "true", + "VI SGPR initilization bug requiring a fixed SGPR allocation size">; + +def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer", + "EnableHugeScratchBuffer", + "true", + "Enable scratch buffer sizes greater than 128 GB">; + +class SubtargetFeatureFetchLimit <string Value> : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast<string>(Value), + "The number of threads per wavefront">; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < + "ldsbankcount"#Value, + "LDSBankCount", + !cast<string>(Value), + "The number of LDS banks per compute unit.">; + +def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; +def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; + +class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping> + : SubtargetFeature < + "isaver"#Major#"."#Minor#"."#Stepping, + "IsaVersion", + "ISAVersion"#Major#"_"#Minor#"_"#Stepping, + "Instruction set version number" +>; + +def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; +def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>; +def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>; +def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; + +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes">; + +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU">; + +def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", + "GCN1Encoding", + "true", + "Encoding format for SI and CI">; + +def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", + "GCN3Encoding", + "true", + "Encoding format for VI">; + +def FeatureCIInsts : SubtargetFeature<"ci-insts", + "CIInsts", + "true", + "Additional intstructions for CI+">; + +// Dummy feature used to disable assembler instructions. 
+def FeatureDisable : SubtargetFeature<"", + "FeatureDisable","true", + "Dummy feature to disable assembler" + " instructions">; + +class SubtargetFeatureGeneration <string Value, + list<SubtargetFeature> Implies> : + SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, + Value#" GPU generation", Implies>; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +def FeatureR600 : SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; + +def FeatureR700 : SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; + +def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; + +def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + +def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32]>; + +def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, + FeatureGCN1Encoding, FeatureCIInsts]>; + +def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + +//===----------------------------------------------------------------------===// + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def AMDGPUAsmParser : AsmParser { + // Some of the R600 registers have the same name, so this crashes. + // For example T0_XYZW and T0_XY both have the asm name T0. 
+ let ShouldEmitMatchRegisterName = 0; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyParsers = [AMDGPUAsmParser]; +} + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +def TruePredicate : Predicate<"true">; +def isSICI : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" +>, AssemblerPredicate<"FeatureGCN1Encoding">; + +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + +class PredicateControl { + Predicate SubtargetPredicate; + Predicate SIAssemblerPredicate = isSICI; + Predicate VIAssemblerPredicate = isVI; + list<Predicate> AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" +include "AMDGPUCallingConv.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp new file mode 100644 index 0000000..ad267d3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -0,0 +1,64 @@ +//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass marks all internal functions as always_inline and creates +/// duplicates of all other functions a marks the duplicates as always_inline. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +class AMDGPUAlwaysInline : public ModulePass { + static char ID; + +public: + AMDGPUAlwaysInline() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } +}; + +} // End anonymous namespace + +char AMDGPUAlwaysInline::ID = 0; + +bool AMDGPUAlwaysInline::runOnModule(Module &M) { + std::vector<Function *> FuncsToClone; + + for (Function &F : M) { + if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && + !F.hasFnAttribute(Attribute::NoInline)) + FuncsToClone.push_back(&F); + } + + for (Function *F : FuncsToClone) { + ValueToValueMapTy VMap; + Function *NewFunc = CloneFunction(F, VMap, false); + NewFunc->setLinkage(GlobalValue::InternalLinkage); + M.getFunctionList().push_back(NewFunc); + F->replaceAllUsesWith(NewFunc); + } + + for (Function &F : M) { + if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { + F.addFnAttr(Attribute::AlwaysInline); + } + } + return false; +} + +ModulePass *llvm::createAMDGPUAlwaysInlinePass() { + return new AMDGPUAlwaysInline(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp new file mode 100644 index 0000000..3781839 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -0,0 +1,126 @@ +//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass adds target attributes to functions which use intrinsics +/// which will impact calling convention lowering. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +#define DEBUG_TYPE "amdgpu-annotate-kernel-features" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateKernelFeatures : public ModulePass { +private: + void addAttrToCallers(Function *Intrin, StringRef AttrName); + bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + +public: + static char ID; + + AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { + return "AMDGPU Annotate Kernel Features"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } +}; + +} + +char AMDGPUAnnotateKernelFeatures::ID = 0; + +char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; + + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) +INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) + + +void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, + StringRef AttrName) { + SmallPtrSet<Function *, 4> SeenFuncs; + + for (User *U : Intrin->users()) { + // CallInst is the only valid user for an intrinsic. 
+ CallInst *CI = cast<CallInst>(U); + + Function *CallingFunction = CI->getParent()->getParent(); + if (SeenFuncs.insert(CallingFunction).second) + CallingFunction->addFnAttr(AttrName); + } +} + +bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( + Module &M, + ArrayRef<StringRef[2]> IntrinsicToAttr) { + bool Changed = false; + + for (const StringRef *Arr : IntrinsicToAttr) { + if (Function *Fn = M.getFunction(Arr[0])) { + addAttrToCallers(Fn, Arr[1]); + Changed = true; + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + + static const StringRef IntrinsicToAttr[][2] = { + // .x omitted + { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, + { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, + + // .x omitted + { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, + { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } + + }; + + static const StringRef HSAIntrinsicToAttr[][2] = { + { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" }, + + { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" }, + { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" } + }; + + // TODO: Intrinsics that require queue ptr. + + // We do not need to note the x workitem or workgroup id because they are + // always initialized. + + bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); + if (TT.getOS() == Triple::AMDHSA) + Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); + + return Changed; +} + +ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { + return new AMDGPUAnnotateKernelFeatures(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp new file mode 100644 index 0000000..dfddc34 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -0,0 +1,84 @@ +//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass adds amdgpu.uniform metadata to IR values so this information +/// can be used during instruction selection. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-annotate-uniform" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateUniformValues : public FunctionPass, + public InstVisitor<AMDGPUAnnotateUniformValues> { + DivergenceAnalysis *DA; + +public: + static char ID; + AMDGPUAnnotateUniformValues() : + FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DivergenceAnalysis>(); + AU.setPreservesAll(); + } + + void visitLoadInst(LoadInst &I); + +}; + +} // End anonymous namespace + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) + +char AMDGPUAnnotateUniformValues::ID = 0; + +void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + if (!DA->isUniform(Ptr)) + return; + + if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) + PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + +} + +bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + return false; +} + +bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { + DA = &getAnalysis<DivergenceAnalysis>(); + visit(F); + + return true; +} + +FunctionPass * +llvm::createAMDGPUAnnotateUniformValues() { + return new AMDGPUAnnotateUniformValues(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp new file mode 100644 index 0000000..1239dfb2 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -0,0 +1,682 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. 
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUAsmPrinter.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "AMDKernelCodeT.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be override to flush with the -cl-denorms-are-zero compiler flag. +// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. +// +// FIXME: It seems some instructions do not support single precision denormals +// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, +// and sin_f32, cos_f32 on most parts). + +// We want to use these instructions, and using fp32 denormals also causes +// instructions to run at the double precision rate for the device so it's +// probably best to just report no single precision denormals. +static uint32_t getFPMode(const MachineFunction &F) { + const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>(); + // TODO: Is there any real use for the flush in only / flush out only modes? + + uint32_t FP32Denormals = + ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + uint32_t FP64Denormals = + ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP32Denormals) | + FP_DENORM_MODE_DP(FP64Denormals); +} + +static AsmPrinter * +createAMDGPUAsmPrinterPass(TargetMachine &tm, + std::unique_ptr<MCStreamer> &&Streamer) { + return new AMDGPUAsmPrinter(tm, std::move(Streamer)); +} + +extern "C" void LLVMInitializeAMDGPUAsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); +} + +AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)) {} + +void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + // Need to construct an MCSubtargetInfo here in case we have no functions + // in the module. 
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + + TS->EmitDirectiveHSACodeObjectVersion(1, 0); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); + TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, + "AMD", "AMDGPU"); +} + +void AMDGPUAsmPrinter::EmitFunctionBodyStart() { + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); + SIProgramInfo KernelInfo; + if (STM.isAmdHsaOS()) { + getSIProgramInfo(KernelInfo, *MF); + EmitAmdKernelCodeT(*MF, KernelInfo); + } +} + +void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); + if (MFI->isKernel() && STM.isAmdHsaOS()) { + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(), + ELF::STT_AMDGPU_HSA_KERNEL); + } + + AsmPrinter::EmitFunctionEntryLabel(); +} + +static bool isModuleLinkage(const GlobalValue *GV) { + switch (GV->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::CommonLinkage: + return true; + case GlobalValue::ExternalLinkage: + return false; + default: llvm_unreachable("unknown linkage type"); + } +} + +void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + // Group segment variables aren't emitted in HSA. + if (AMDGPU::isGroupSegment(GV)) + return; + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + if (isModuleLinkage(GV)) { + TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); + } else { + TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); + } + + MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV)); + const DataLayout &DL = getDataLayout(); + + // Emit the size + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); + OutStreamer->PushSection(); + OutStreamer->SwitchSection( + getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); + const Constant *C = GV->getInitializer(); + OutStreamer->EmitLabel(GVSym); + EmitGlobalConstant(DL, C); + OutStreamer->PopSection(); +} + +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + // The starting address of all shader programs must be 256 bytes aligned. 
+ MF.setAlignment(8); + + SetupMachineFunction(MF); + + MCContext &Context = getObjFileLowering().getContext(); + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + SIProgramInfo KernelInfo; + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); + if (!STM.isAmdHsaOS()) { + EmitProgramInfoSI(MF, KernelInfo); + } + } else { + EmitProgramInfoR600(MF); + } + + DisasmLines.clear(); + HexLines.clear(); + DisasmLineMaxLen = 0; + + EmitFunctionBody(); + + if (isVerbose()) { + MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(CommentSection); + + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + OutStreamer->emitRawComment(" Kernel info:", false); + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); + OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), + false); + OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), + false); + OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); + OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); + + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), + false); + + } else { + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + OutStreamer->emitRawComment( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); + } + } + + if (STM.dumpCode()) { + + OutStreamer->SwitchSection( + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); + + for (size_t i = 0; i < DisasmLines.size(); ++i) { + std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; + + OutStreamer->EmitBytes(StringRef(DisasmLines[i])); + OutStreamer->EmitBytes(StringRef(Comment)); + } + } + + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const R600RegisterInfo *RI = + static_cast<const R600RegisterInfo *>(STM.getRegisterInfo()); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + + // Register with value > 127 aren't GPR + if 
(HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + // Evergreen / Northern Islands + switch (MFI->getShaderType()) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MFI->getShaderType()) { + default: // Fall through + case ShaderType::GEOMETRY: // Fall through + case ShaderType::COMPUTE: // Fall through + case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->StackSize), 4); + OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + } +} + +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + uint64_t CodeSize = 0; + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + bool FlatUsed = false; + const SIRegisterInfo *RI = + static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: CodeSize should account for multiple functions. + + // TODO: Should we count size of debug info? + if (MI.isDebugValue()) + continue; + + // FIXME: This is reporting 0 for many instructions. 
+ CodeSize += MI.getDesc().Size; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + unsigned width = 0; + bool isSGPR = false; + + if (!MO.isReg()) + continue; + + unsigned reg = MO.getReg(); + switch (reg) { + case AMDGPU::EXEC: + case AMDGPU::SCC: + case AMDGPU::M0: + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + VCCUsed = true; + continue; + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + FlatUsed = true; + continue; + + default: + break; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(reg)) { + isSGPR = false; + width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(reg)) { + isSGPR = false; + width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(reg)) { + isSGPR = true; + width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(reg)) { + isSGPR = false; + width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned hwReg = RI->getEncodingValue(reg) & 0xff; + unsigned maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; + } + } + } + } + + unsigned ExtraSGPRs = 0; + + if (VCCUsed) + ExtraSGPRs = 2; + + if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (FlatUsed) + ExtraSGPRs = 4; + } else { + if (STM.isXNACKEnabled()) + ExtraSGPRs = 4; + + if (FlatUsed) + ExtraSGPRs = 6; + } + + MaxSGPR += ExtraSGPRs; + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR = MaxSGPR + 1; + + if (STM.hasSGPRInitBug()) { + if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many SGPRs used with the SGPR init bug"); + } + + ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + } + + if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; + ProgInfo.CodeLen = CodeSize; + + unsigned LDSAlignShift; + if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + // LDS is allocated in 64 dword blocks. 
+ LDSAlignShift = 8; + } else { + // LDS is allocated in 128 dword blocks. + LDSAlignShift = 9; + } + + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); + + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. ProgInfo.ScratchSize is the amount of + // scratch memory used per thread. + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); + + OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); + + OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + + // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. 
+ } else { + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + if (STM.isVGPRSpillingEnabled(MFI)) { + OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + } + } + + if (MFI->getShaderType() == ShaderType::PIXEL) { + OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + } +} + +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + amd_kernel_code_t header; + + AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); + + header.compute_pgm_resource_registers = + KernelInfo.ComputePGMRSrc1 | + (KernelInfo.ComputePGMRSrc2 << 32); + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if (MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (STM.isXNACKEnabled()) + header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; + + header.kernarg_segment_byte_size = MFI->ABIArgOffset; + header.wavefront_sgpr_count = KernelInfo.NumSGPR; + header.workitem_vgpr_count = KernelInfo.NumVGPR; + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + TS->EmitAMDKernelCodeT(header); +} + +bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. 
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'r': + break; + } + } + + AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, + *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h new file mode 100644 index 0000000..99d4091 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -0,0 +1,119 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" +#include <vector> + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { +private: + struct SIProgramInfo { + SIProgramInfo() : + VGPRBlocks(0), + SGPRBlocks(0), + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), + FlatUsed(false), + VCCUsed(false), + CodeLen(0) {} + + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + uint32_t ScratchSize; + + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; + bool FlatUsed; + + // Bonus information for debugging. + bool VCCUsed; + uint64_t CodeLen; + }; + + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void findNumUsedRegistersSI(const MachineFunction &MF, + unsigned &NumSGPR, + unsigned &NumVGPR) const; + + /// \brief Emit register usage information so that the GPU driver + /// can correctly setup the GPU state. 
+ void EmitProgramInfoR600(const MachineFunction &MF); + void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const; + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AMDGPU Assembly Printer"; + } + + /// Implemented in AMDGPUMCInstLower.cpp + void EmitInstruction(const MachineInstr *MI) override; + + void EmitFunctionBodyStart() override; + + void EmitFunctionEntryLabel() override; + + void EmitGlobalVariable(const GlobalVariable *GV) override; + + void EmitStartOfAsmFile(Module &M) override; + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + +protected: + std::vector<std::string> DisasmLines, HexLines; + size_t DisasmLineMaxLen; +}; + +} // End anonymous llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td new file mode 100644 index 0000000..b0db261 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -0,0 +1,137 @@ +//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the AMD Radeon GPUs. +// +//===----------------------------------------------------------------------===// + +// Inversion of CCIfInReg +class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} + +// Calling convention for SI +def CC_SI : CallingConv<[ + + CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + ]>>>, + + CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] + >>>, + + // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
+ CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 + ]>>>, + + CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] + >>> + +]>; + +def RetCC_SI : CallingConv<[ + CCIfType<[i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + ]>>, + + // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
+ CCIfType<[f32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 + ]>> +]>; + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ + T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, + T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, + T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, + T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, + T30_XYZW, T31_XYZW, T32_XYZW + ]>>> +]>; + +// Calling convention for compute kernels +def CC_AMDGPU_Kernel : CallingConv<[ + CCCustom<"allocateStack"> +]>; + +def CC_AMDGPU : CallingConv<[ + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >=" + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_SI>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_R600>> +]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp new file mode 100644 index 0000000..2f6b302 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp @@ -0,0 +1,26 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" + +using namespace llvm; + +DiagnosticInfoUnsupported::DiagnosticInfoUnsupported( + const Function &Fn, + const Twine &Desc, + DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + +int DiagnosticInfoUnsupported::KindID = 0; + +void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h new file mode 100644 index 0000000..0fd37e1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h @@ -0,0 +1,48 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H + +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +namespace llvm { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error); + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp new file mode 100644 index 0000000..4d84d28 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -0,0 +1,118 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface to describe a layout of a stack frame on a AMDIL target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { } + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { } + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. 
+ // + // I think the StackWidth should stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by a R600 specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. +int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); + + // Start the offset at 2 so we don't overwrite work group information. + // XXX: We should only do this when the shader actually uses this + // information. + unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); + OffsetBytes += MFI->getObjectSize(i); + // Each register holds 4 bytes, so we must always align the offset to at + // least 4 bytes, so that 2 frame objects won't share the same register. + OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); + } + + if (FI != -1) + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); + + return OffsetBytes / (getStackWidth(MF) * 4); +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return nullptr; +} +void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const {} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h new file mode 100644 index 0000000..257a3da --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -0,0 +1,44 @@ +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on an AMDGPU target. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +/// \brief Information about the stack frame layout on the AMDGPU targets. +/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. + unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool hasFP(const MachineFunction &MF) const override; +}; +} // namespace llvm +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp new file mode 100644 index 0000000..b33040b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -0,0 +1,1513 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIDefines.h" +#include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. 
+ const AMDGPUSubtarget *Subtarget; + +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + bool runOnMachineFunction(MachineFunction &MF) override; + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PreprocessISelDAG() override; + void PostprocessISelDAG() override; + +private: + bool isInlineImmediate(SDNode *N) const; + bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, + const R600InstrInfo *TII); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); + bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + bool isCPLoad(const LoadSDNode *N) const; + bool isConstantLoad(const LoadSDNode *N, int cbID) const; + bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; + bool isParamLoad(const LoadSDNode *N) const; + bool isPrivateLoad(const LoadSDNode *N) const; + bool isLocalLoad(const LoadSDNode *N) const; + bool isRegionLoad(const LoadSDNode *N) const; + + SDNode *glueCopyToM0(SDNode *N) const; + + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, SDValue &Offset, + SDValue &SLC) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset, SDValue &GLC) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, + bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, + bool &Imm) const; + bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool 
SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; + SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Omod) const; + bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); + + SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width); + SDNode *SelectS_BFEFromShifts(SDNode *N); + SDNode *SelectS_BFE(SDNode *N); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) + : SelectionDAGISel(TM) {} + +bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget()); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast<const SITargetLowering *>(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + +/// \brief Determine the register class for \p OpNo +/// \returns The register class of the virtual register that will be used for +/// the given operand number \OpNo or NULL if the register class cannot be +/// determined. 
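+/// For ordinary machine opcodes the class comes from the operand's entry in
+/// the MCInstrDesc; for REG_SEQUENCE it is derived from the super-register
+/// class and the sub-register index held in the operand following \p OpNo.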
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, + unsigned OpNo) const { + if (!N->isMachineOpcode()) + return nullptr; + + switch (N->getMachineOpcode()) { + default: { + const MCInstrDesc &Desc = + Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + unsigned OpIdx = Desc.getNumDefs() + OpNo; + if (OpIdx >= Desc.getNumOperands()) + return nullptr; + int RegClass = Desc.OpInfo[OpIdx].RegClass; + if (RegClass == -1) + return nullptr; + + return Subtarget->getRegisterInfo()->getRegClass(RegClass); + } + case AMDGPU::REG_SEQUENCE: { + unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = + Subtarget->getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); + return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, + SubRegIdx); + } + } +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + return true; +} + +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(), + AMDGPUAS::LOCAL_ADDRESS)) + return N; + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + // Write max value to m0 before each load operation + + SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), + CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + + SDValue Glue = M0.getValue(1); + + SmallVector <SDValue, 8> Ops; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + Ops.push_back(N->getOperand(i)); + } + Ops.push_back(Glue); + CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); + + return N; +} + +static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { + switch (NumVectorElts) { + case 1: + return AMDGPU::SReg_32RegClassID; + case 2: + return AMDGPU::SReg_64RegClassID; + case 
4: + return AMDGPU::SReg_128RegClassID; + case 8: + return AMDGPU::SReg_256RegClassID; + case 16: + return AMDGPU::SReg_512RegClassID; + } + + llvm_unreachable("invalid vector size"); +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. + } + + if (isa<AtomicSDNode>(N)) + N = glueCopyToM0(N); + + switch (Opc) { + default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: + case ISD::SUB: { + if (N->getValueType(0) != MVT::i64 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectADD_SUB_I64(N); + } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: + case ISD::BUILD_VECTOR: { + unsigned RegClassID; + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + RegClassID = selectSGPRVectorRegClassID(NumVectorElts); + } else { + // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. + switch(NumVectorElts) { + case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } + + SDLoc DL(N); + SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + + if (NumVectorElts == 1) { + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, + N->getOperand(0), RegClass); + } + + assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + "supported yet"); + // 16 = Max Num Vector Elements + // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) + // 1 = Vector Register Class + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); + + RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + bool IsRegSeq = true; + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { + // XXX: Why is this here? + if (isa<RegisterSDNode>(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[1 + (2 * i)] = N->getOperand(i); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + MVT::i32); + } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. 
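+      // For example, a v4i32 SCALAR_TO_VECTOR supplies only its scalar operand
+      // for lane 0; the remaining lanes are all tied to a single IMPLICIT_DEF
+      // value below, each paired with its sub-register index in the
+      // REG_SEQUENCE operands.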
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + DL, EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + } + } + + if (!IsRegSeq) + break; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), + RegSeqArgs); + } + case ISD::BUILD_PAIR: { + SDValue RC, SubReg0, SubReg1; + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + break; + } + SDLoc DL(N); + if (N->getValueType(0) == MVT::i128) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); + } else if (N->getValueType(0) == MVT::i64) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + } else { + llvm_unreachable("Unhandled value type for BUILD_PAIR"); + } + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, + N->getOperand(1), SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + case ISD::Constant: + case ISD::ConstantFP: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast<ConstantSDNode>(N); + Imm = C->getZExtValue(); + } + + SDLoc DL(N); + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, + MVT::i32)); + SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops); + } + case ISD::LOAD: + case ISD::STORE: { + N = glueCopyToM0(N); + break; + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + return getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), + N->getOperand(0), OffsetVal, WidthVal); + } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } + case ISD::CopyToReg: { + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + Lowering.legalizeTargetIndependentNode(N, *CurDAG); + break; + } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); + case ISD::AND: + case ISD::SRL: + case ISD::SRA: + if (N->getValueType(0) != MVT::i32 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectS_BFE(N); + } + + return SelectCode(N); +} + +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) + return false; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; +} + +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); +} + +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); +} + +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { + if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getMemoryVT().bitsLT(MVT::i32)) + return true; + + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { + MachineMemOperand *MMO = N->getMemOperand(); + if (checkPrivateAddress(N->getMemOperand())) { + if (MMO) { + const PseudoSourceValue *PSV = MMO->getPseudoValue(); + if (PSV && PSV->isConstantPool()) { + return true; + } + } + } + return false; +} + +bool 
AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { + if (checkPrivateAddress(N->getMemOperand())) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { + return true; + } + return false; +} + +const char *AMDGPUDAGToDAGISel::getPassName() const { + return "AMDGPU DAG->DAG Pattern Instruction Selection"; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP + +//===----------------------------------------------------------------------===// +// Complex Patterns +//===----------------------------------------------------------------------===// + +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa<ConstantSDNode>(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. 
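+  // In that case the base becomes the hardware ZERO register and the entire
+  // constant is carried in the immediate offset (it must fit in 16 bits,
+  // hence the isInt<16> check below).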
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + SDLoc(CurDAG->getEntryNode()), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + SDLoc DL(Addr); + + if ((C = dyn_cast<ConstantSDNode>(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } + + return true; +} + +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod + SDValue Ops[8]; + + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS || + Subtarget->unsafeDSOffsetFoldingEnabled()) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + +bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + // (add n0, c0) + Base = N0; + Offset = N1; + return true; + } + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + int64_t ByteOffset = C->getSExtValue(); + if (isUInt<16>(ByteOffset)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset = Addr.getOperand(0); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. + + SDLoc DL(Addr); + + if (isUInt<16>(CAddr->getZExtValue())) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset = Addr; + return true; + } + } + + // default case + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); + return true; +} + +// TODO: If offset is too big, put low 16-bit into offset. 
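+// The two 8-bit operands produced below are dword offsets: for a constant byte
+// offset C, offset0 = C / 4 and offset1 = C / 4 + 1 (e.g. C = 40 encodes as
+// offset0 = 10, offset1 = 11), matching the paired read2 / write2 style DS
+// accesses mentioned above.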
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + unsigned DWordOffset0 = C->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + + if (isUInt<8>(DWordOffset0)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned DWordOffset0 = CAddr->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + assert(4 * DWordOffset0 == CAddr->getZExtValue()); + + if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero + = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); + return true; +} + +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + // Subtarget prefers to use flat instruction + if (Subtarget->useFlatForGlobal()) + return false; + + SDLoc DL(Addr); + + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue 
N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N2; + VAddr = N3; + } else { + + // (add N0, C1) -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = N0; + } + + if (isLegalMUBUFImmOffset(C1)) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } else if (isUInt<32>(C1->getZExtValue())) { + // Illegal offset, store it in soffset. + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), + 0); + return true; + } + } + + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) -> addr64 + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N0; + VAddr = N1; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; + } + + // default case -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + SDValue Ptr, Offen, Idxen, Addr64; + + // addr64 bit was removed for volcanic islands. + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return false; + + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; + + ConstantSDNode *C = cast<ConstantSDNode>(Addr64); + if (C->getSExtValue()) { + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); + SDValue GLC, TFE; + + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); + + // (add n0, c1) + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + // Offsets in vaddr must be positive. 
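+  // That is, the constant is only folded when the sign bit of the base is
+  // known to be zero (checked below), since the immediate itself is an
+  // unsigned 12-bit field; see isLegalMUBUFImmOffset.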
+ if (CurDAG->SignBitIsZero(N0)) { + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isLegalMUBUFImmOffset(C1)) { + VAddr = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + } + } + + // (node) + VAddr = Addr; + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &SOffset, SDValue &Offset, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDValue Ptr, VAddr, Offen, Idxen, Addr64; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; + + if (!cast<ConstantSDNode>(Offen)->getSExtValue() && + !cast<ConstantSDNode>(Idxen)->getSExtValue() && + !cast<ConstantSDNode>(Addr64)->getSExtValue()) { + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | + APInt::getAllOnesValue(32).getZExtValue(); // Size + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset, + SDValue &GLC) const { + SDValue SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} + +/// +/// \param EncodedOffset This is the immediate value that will be encoded +/// directly into the instruction. On SI/CI the \p EncodedOffset +/// will be in units of dwords and on VI+ it will be units of bytes. +static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST, + int64_t EncodedOffset) { + return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, + SDValue &Offset, bool &Imm) const { + + // FIXME: Handle non-constant offsets. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); + if (!C) + return false; + + SDLoc SL(ByteOffsetNode); + AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); + int64_t ByteOffset = C->getSExtValue(); + int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + ByteOffset >> 2 : ByteOffset; + + if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) { + Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + Imm = true; + return true; + } + + if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset)) + return false; + + if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) { + // 32-bit Immediates are supported on Sea Islands. 
+ Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + } else { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, + C32Bit), 0); + } + Imm = false; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, + SDValue &Offset, bool &Imm) const { + + SDLoc SL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + + if (SelectSMRDOffset(N1, Offset, Imm)) { + SBase = N0; + return true; + } + } + SBase = Addr; + Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + Imm = true; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRD(Addr, SBase, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, + SDValue &Offset) const { + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRDOffset(Addr, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + +// FIXME: This is incorrect and only enough to be able to compile. +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); + SDLoc DL(N); + + const MachineFunction &MF = CurDAG->getMachineFunction(); + DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), + "addrspacecast not implemented"); + CurDAG->getContext()->diagnose(NotImplemented); + + assert(Subtarget->hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. 
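+  // Concretely: narrowing a 64-bit flat pointer keeps only sub0 via
+  // EXTRACT_SUBREG, widening a 32-bit pointer builds a REG_SEQUENCE with a
+  // zero high half, and a same-width cast is just a BITCAST (all handled
+  // below).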
+ + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); + } + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + + // FIXME: This is probably wrong, we should never be defining + // a register class with both VGPRs and SGPRs + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, + MVT::i32); + + const SDValue Ops[] = { + RC, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(0, DL, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width) { + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + uint32_t PackedVal = Offset | (Width << 16); + SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { + // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) + // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) + // Predicate: 0 < b <= c < 32 + + const SDValue &Shl = N->getOperand(0); + ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (B && C) { + uint32_t BVal = B->getZExtValue(); + uint32_t CVal = C->getZExtValue(); + + if (0 < BVal && BVal <= CVal && CVal < 32) { + bool Signed = N->getOpcode() == ISD::SRA; + unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), + CVal - BVal, 32 - CVal); + } + } + return SelectCode(N); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { + switch (N->getOpcode()) { + case ISD::AND: + if (N->getOperand(0).getOpcode() == ISD::SRL) { + // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" + // Predicate: isMask(mask) + const SDValue &Srl = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue(); + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), + ShiftVal, WidthVal); + } + } + } + break; + case ISD::SRL: + if (N->getOperand(0).getOpcode() == ISD::AND) { + // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" + // Predicate: isMask(mask >> b) + const SDValue &And = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), + ShiftVal, WidthVal); + } + } + } else if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + case ISD::SRA: + if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + } + + return SelectCode(N); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + + unsigned Mods = 0; + + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::FABS) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + bool Res = SelectVOP3Mods(In, Src, SrcMods); + return Res && cast<ConstantSDNode>(SrcMods)->isNullValue(); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + SDLoc DL(In); + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); + + return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() && + cast<ConstantSDNode>(Clamp)->isNullValue() && + cast<ConstantSDNode>(Omod)->isNullValue(); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Omod) const { + // FIXME: Handle Omod + Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const { + Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), 
MVT::i32); + return SelectVOP3Mods(In, Src, SrcMods); +} + +void AMDGPUDAGToDAGISel::PreprocessISelDAG() { + bool Modified = false; + + // XXX - Other targets seem to be able to do this without a worklist. + SmallVector<LoadSDNode *, 8> LoadsToReplace; + SmallVector<StoreSDNode *, 8> StoresToReplace; + + for (SDNode &Node : CurDAG->allnodes()) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { + EVT VT = LD->getValueType(0); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + continue; + + // To simplify the TableGen patters, we replace all i64 loads with v2i32 + // loads. Alternatively, we could promote i64 loads to v2i32 during DAG + // legalization, however, so places (ExpandUnalignedLoad) in the DAG + // legalizer assume that if i64 is legal, so doing this promotion early + // can cause problems. + LoadsToReplace.push_back(LD); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { + // Handle i64 stores here for the same reason mentioned above for loads. + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + continue; + StoresToReplace.push_back(ST); + } + } + + for (LoadSDNode *LD : LoadsToReplace) { + SDLoc SL(LD); + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); + Modified = true; + } + + for (StoreSDNode *ST : StoresToReplace) { + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), + MVT::v2i32, ST->getValue()); + const SDValue StoreOps[] = { + ST->getChain(), + NewValue, + ST->getBasePtr(), + ST->getOffset() + }; + + CurDAG->UpdateNodeOperands(ST, StoreOps); + Modified = true; + } + + // XXX - Is this necessary? + if (Modified) + CurDAG->RemoveDeadNodes(); +} + +void AMDGPUDAGToDAGISel::PostprocessISelDAG() { + const AMDGPUTargetLowering& Lowering = + *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); + bool IsModified = false; + do { + IsModified = false; + // Go over all selected nodes and try to fold them a bit more + for (SDNode &Node : CurDAG->allnodes()) { + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node); + if (!MachineNode) + continue; + + SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); + if (ResNode != &Node) { + ReplaceUses(&Node, ResNode); + IsModified = true; + } + } + CurDAG->RemoveDeadNodes(); + } while (IsModified); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp new file mode 100644 index 0000000..1a59a46 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -0,0 +1,3102 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600MachineFunctionInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" + +using namespace llvm; + +static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + + return true; +} + +#include "AMDGPUGenCallingConv.inc" + +// Find a larger type to do a load / store of a vector with. +EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. +EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + // This is totally unsupported, just custom lower to produce an error. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. 
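// As a sketch (DL, LHS and RHS are placeholder names, not nodes from this
// patch), the expansion is equivalent to rebuilding each f64 FSUB as
//   SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f64, RHS);
//   SDValue Diff   = DAG.getNode(ISD::FADD, DL, MVT::f64, LHS, NegRHS);
// so only FNEG and FADD need native f64 support.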
+ setOperationAction(ISD::FSUB, MVT::f64, Expand); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v8f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + + // Custom lowering of vector stores is required for local address space + // stores. + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + + // XXX: This can be change to Custom, once ExpandVectorStores can + // handle 64-bit stores. + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. 
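// For example, a zero-extending i8 -> i64 load is legalized roughly as
//   i64 (zextload i8* p)  ==>  (zero_extend i64 (i32 (zextload i8* p)))
// where the 32-bit extload is legal and the final extension is just a mov of
// zero into the high 32 bits.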
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } + + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. 
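// A left rotate can always be re-expressed with the supported right rotate;
// for the 32-bit case the identity is
//   rotl x, n  ==  rotr x, (32 - n) & 31
// so ROTL is simply marked Expand below and left to the generic legalizer.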
+ setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + + setOperationAction(ISD::SMIN, MVT::i32, Legal); + setOperationAction(ISD::UMIN, MVT::i32, Legal); + setOperationAction(ISD::SMAX, MVT::i32, Legal); + setOperationAction(ISD::UMAX, MVT::i32, Legal); + + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + else + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + + if (!Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTLZ, MVT::i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + + static const MVT::SimpleValueType VectorIntTypes[] = { + MVT::v2i32, MVT::v4i32 + }; + + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. + setOperationAction(ISD::ADD, VT, Expand); + setOperationAction(ISD::AND, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::OR, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::SUB, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + static const MVT::SimpleValueType FloatVectorTypes[] = { + MVT::v2f32, MVT::v4f32 + }; + + for (MVT VT : FloatVectorTypes) { + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); + 
setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(false); + + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + + setFsqrtIsCheap(true); + + // We want to find all load dependencies for long chains of stores to enable + // merging into very wide vectors. The problem is with vectors with > 4 + // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 + // vectors are a legal type, even though we have to split the loads + // usually. When we can more precisely specify load legality per address + // space, we should be able to make FindBetterChain/MergeConsecutiveStores + // smarter so that they can figure out what to do in 2 iterations without all + // N > 4 stores on the same chain. + GatherAllAliasesMaxDepth = 16; + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; +} + +//===----------------------------------------------------------------------===// +// Target Information +//===----------------------------------------------------------------------===// + +MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { + return MVT::i32; +} + +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. 
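// ("Shrinking" here means storing a constant-pool f64 as f32 plus an
// extending load when the value is exactly representable, e.g. keeping 0.5
// as a 32-bit constant. Both widths are legal immediates on this target (see
// isFPImmLegal above), so the narrowing would buy nothing.)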
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, + ISD::LoadExtType, + EVT NewVT) const { + + unsigned NewSize = NewVT.getStoreSizeInBits(); + + // If we are reducing to a 32-bit load, this is always better. + if (NewSize == 32) + return true; + + EVT OldVT = N->getValueType(0); + unsigned OldSize = OldVT.getStoreSizeInBits(); + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. + + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (OldSize < 32); +} + +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, + EVT CastTy) const { + if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) + return true; + + unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + + return ((LScalarSize <= CastScalarSize) || + (CastScalarSize >= 32) || + (LScalarSize < 32)); +} + +// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also +// profitable with the expansion for 64-bit since it's generally good to +// speculate things. +// FIXME: These should really have the size as a parameter. +bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { + return true; +} + +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { + return true; +} + +//===---------------------------------------------------------------------===// +// Target Properties +//===---------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const { + return true; +} + +bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { + // There are few operations which truly have vector input operands. Any vector + // operation is going to involve operations on each component, and a + // build_vector will be a copy per element, so it always makes sense to use a + // build_vector input in place of the extracted element to avoid a copy into a + // super register. + // + // We should probably only do this if all users are extracts only, but this + // should be the common case. + return true; +} + +bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { + // Truncate is just accessing a subregister. + return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { + // Truncate is just accessing a subregister. 
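// For example, truncating an i64 held in a pair of 32-bit registers down to
// i32 is just a use of the low 32-bit half, so no instruction is emitted;
// the % 32 check below restricts this to destinations that still fill whole
// 32-bit registers.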
+ return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && + (Dest->getPrimitiveSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { + unsigned SrcSize = Src->getScalarSizeInBits(); + unsigned DestSize = Dest->getScalarSizeInBits(); + + return SrcSize == 32 && DestSize == 64; +} + +bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { + // Any register load of a 64-bit value really requires 2 32-bit moves. For all + // practical purposes, the extra mov 0 to load a 64-bit is free. As used, + // this will enable reducing 64-bit operations the 32-bit, which is always + // good. + return Src == MVT::i32 && Dest == MVT::i64; +} + +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + +bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { + // There aren't really 64-bit registers, but pairs of 32-bit ones and only a + // limited number of native 64-bit operations. Shrinking an operation to fit + // in a single 32-bit register should always be helpful. As currently used, + // this is much less general than the name suggests, and is only used in + // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is + // not profitable, and may actually be harmful. + return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { + + State.AnalyzeFormalArguments(Ins, CC_AMDGPU); +} + +void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const { + + State.AnalyzeReturn(Outs, RetCC_SI); +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName("<unknown>"); + + if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DAG.getContext()->diagnose(NoDynamicAlloca); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + 
llvm_unreachable("Custom lowering code for this" + "instruction is not implemented yet!"); + break; + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FREM: return LowerFREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + return LowerCTLZ(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + } + return Op; +} + +void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND_INREG: + // Different parts of legalization seem to interpret which type of + // sign_extend_inreg is the one to check for custom lowering. The extended + // from type is what really matters, but some places check for custom + // lowering of the result type. This results in trying to use + // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do + // nothing here and let the illegal result integer be handled normally. + return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; + + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; + } + default: + return; + } +} + +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. 
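// As a concrete sketch of the copying scheme below: for a hypothetical
// constant-address global
//   int foo[3] = {1, 2, 3};
// LowerConstantInitializer produces three private-address stores, roughly
//   store i32 1, (InitPtr + 0)
//   store i32 2, (InitPtr + 4)
//   store i32 3, (InitPtr + 8)
// chained together with a TokenFactor, and the global's address is replaced
// by the stack object that InitPtr points at.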
+SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, + const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const { + const DataLayout &TD = DAG.getDataLayout(); + SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, + false, TD.getPrefTypeAlignment(InitTy)); + } + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { + EVT VT = EVT::getEVT(CFP->getType()); + PointerType *PtrTy = PointerType::get(CFP->getType(), 0); + return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, + false, TD.getPrefTypeAlignment(CFP->getType())); + } + + if (StructType *ST = dyn_cast<StructType>(InitTy)) { + const StructLayout *SL = TD.getStructLayout(ST); + + EVT PtrVT = InitPtr.getValueType(); + SmallVector<SDValue, 8> Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType()); + SmallVector<SDValue, 8> Chains; + for (unsigned i = 0; i < NumElements; ++i) { + SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (isa<UndefValue>(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, + false, TD.getPrefTypeAlignment(InitTy)); + } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); +} + +static bool hasDefinedInitializer(const GlobalValue *GV) { + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + if (!GVar || !GVar->hasInitializer()) + return false; + + if (isa<UndefValue>(GVar->getInitializer())) + return false; + + return true; +} + +SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, + SDValue Op, + SelectionDAG &DAG) const { + + const DataLayout &DL = DAG.getDataLayout(); + GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = G->getGlobal(); + + switch (G->getAddressSpace()) { + case AMDGPUAS::LOCAL_ADDRESS: { + // XXX: What does the value of G->getOffset() mean? 
+ assert(G->getOffset() == 0 && + "Do not know what to do with an non-zero offset"); + + // TODO: We could emit code to handle the initialization somewhere. + if (hasDefinedInitializer(GV)) + break; + + unsigned Offset; + if (MFI->LocalMemoryObjects.count(GV) == 0) { + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + Offset = MFI->LDSSize; + MFI->LocalMemoryObjects[GV] = Offset; + // XXX: Account for alignment? + MFI->LDSSize += Size; + } else { + Offset = MFI->LocalMemoryObjects[GV]; + } + + return DAG.getConstant(Offset, SDLoc(Op), + getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS)); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + Type *EltType = GV->getType()->getElementType(); + unsigned Size = DL.getTypeAllocSize(EltType); + unsigned Alignment = DL.getPrefTypeAlignment(EltType); + + MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + + const GlobalVariable *Var = cast<GlobalVariable>(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + + const Constant *Init = Var->getInitializer(); + SmallVector<SDNode*, 8> WorkList; + + for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), + E = DAG.getEntryNode()->use_end(); I != E; ++I) { + if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) + continue; + WorkList.push_back(*I); + } + SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); + for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { + Ops.push_back((*I)->getOperand(i)); + } + DAG.UpdateNodeOperands(*I, Ops); + } + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + } + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadInit(Fn, + "initializer for address space"); + DAG.getContext()->diagnose(BadInit); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + SmallVector<SDValue, 8> Args; + + for (const SDUse &U : Op->ops()) + DAG.ExtractVectorElements(U.get(), Args); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + + SmallVector<SDValue, 8> Args; + unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); + + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + +SDValue 
AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } else { + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + } + + case Intrinsic::AMDGPU_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + 
Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfi: + return DAG.getNode(AMDGPUISD::BFI, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfm: + return DAG.getNode(AMDGPUISD::BFM, DL, VT, + Op.getOperand(1), + Op.getOperand(2)); + + case Intrinsic::AMDGPU_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. + return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); + } +} + +///IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Op.getOperand(1)); + + return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + // TODO: Should this propagate fast-math-flags? 
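// Quick numeric check of the formula being built below:
//   lrp(0.25, 8.0, 4.0) = 0.25 * 8.0 + (1 - 0.25) * 4.0 = 2.0 + 3.0 = 5.0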
+ SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, DL, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + break; + case ISD::SETULE: + case ISD::SETULT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + // Ordered. Assume ordered for undefined. + + // Only do this after legalization to avoid interfering with other combines + // which might occur. + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. 
+ if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETOGE: + case ISD::SETOGT: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETCC_INVALID: + llvm_unreachable("Invalid setcc condcode!"); + } + return SDValue(); +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); + EVT MemVT = Load->getMemoryVT(); + EVT MemEltVT = MemVT.getVectorElementType(); + + EVT LoadVT = Op.getValueType(); + EVT EltVT = LoadVT.getVectorElementType(); + EVT PtrVT = Load->getBasePtr().getValueType(); + + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); + SmallVector<SDValue, 8> Loads; + SmallVector<SDValue, 8> Chains; + + SDLoc SL(Op); + unsigned MemEltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), + DAG.getConstant(i * MemEltSize, SL, PtrVT)); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + SrcValue.getWithOffset(i * MemEltSize), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); + } + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
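// e.g. splitting a <2 x i32> load in half would yield two <1 x i32> loads;
// scalarizing gives two plain i32 loads instead, which the rest of the
// backend handles much more naturally.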
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorLoad(Op, DAG); + + LoadSDNode *Load = cast<LoadSDNode>(Op); + SDValue BasePtr = Load->getBasePtr(); + EVT PtrVT = BasePtr.getValueType(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + + unsigned Size = LoMemVT.getStoreSize(); + unsigned BaseAlign = Load->getAlignment(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + + SDValue LoLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, + SrcValue, + LoMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), BaseAlign); + + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Size, SL, PtrVT)); + + SDValue HiLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, + Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), HiAlign); + + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1)) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT MemVT = Store->getMemoryVT(); + unsigned MemBits = MemVT.getSizeInBits(); + + // Byte stores are really expensive, so if possible, try to pack 32-bit vector + // truncating store into an i32 store. + // XXX: We could also handle optimize other vector bitwidths. 
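// Sketch of the packing done below for a truncating store of <4 x i32> to
// <4 x i8> (shown as C expressions, e0..e3 being the extracted elements):
//   packed = (e0 & 0xff)
//          | ((e1 & 0xff) << 8)
//          | ((e2 & 0xff) << 16)
//          | ((e3 & 0xff) << 24);
// and a single 32-bit store of 'packed' replaces four byte stores.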
+ if (!MemVT.isVector() || MemBits > 32) { + return SDValue(); + } + + SDLoc DL(Op); + SDValue Value = Store->getValue(); + EVT VT = Value.getValueType(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); + EVT MemEltVT = MemVT.getVectorElementType(); + unsigned MemEltBits = MemEltVT.getSizeInBits(); + unsigned MemNumElements = MemVT.getVectorNumElements(); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); + + SDValue PackedValue; + for (unsigned i = 0; i < MemNumElements; ++i) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, + DAG.getConstant(i, DL, MVT::i32)); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); + + if (i == 0) { + PackedValue = Elt; + } else { + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); + } + } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + Store->isVolatile(), Store->isNonTemporal(), + Store->getAlignment()); +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); + EVT EltVT = Store->getValue().getValueType().getVectorElementType(); + EVT PtrVT = Store->getBasePtr().getValueType(); + unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); + SDLoc SL(Op); + + SmallVector<SDValue, 8> Chains; + + unsigned EltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Store->getValue(), + DAG.getConstant(i, SL, MVT::i32)); + + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); + SDValue NewStore = + DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, + SrcValue.getWithOffset(i * EltSize), + MemEltVT, Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + Chains.push_back(NewStore); + } + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); +} + +SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Val = Store->getValue(); + EVT VT = Val.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorStore(Op, DAG); + + EVT MemVT = Store->getMemoryVT(); + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + SDLoc SL(Op); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + + EVT PtrVT = BasePtr.getValueType(); + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Store->getAlignment(); + unsigned Size = LoMemVT.getStoreSize(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + + SDValue LoStore + = DAG.getTruncStore(Chain, SL, Lo, + BasePtr, + SrcValue, + LoMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + BaseAlign); + SDValue HiStore + = DAG.getTruncStore(Chain, SL, Hi, + HiPtr, + SrcValue.getWithOffset(Size), + HiMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + HiAlign); + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); +} + + +SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT VT = Op.getValueType(); + EVT MemVT = Load->getMemoryVT(); + + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. + + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || + Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || + ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) + return SDValue(); + + // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, + // register (2-)byte extract. + + // Get Register holding the target. + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register. + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. + Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. 
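// (getZeroExtendInReg below amounts to masking with the memory element
// width, e.g. for an i8 extload it behaves like 'and Ret, 0xff'.)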
+ SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); + if (Result.getNode()) { + return Result; + } + + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Chain = Store->getChain(); + if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + Store->getValue().getValueType().isVector()) { + return SplitVectorStore(Op, DAG); + } + + EVT MemVT = Store->getMemoryVT(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + MemVT.bitsLT(MVT::i32)) { + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + SDValue BasePtr = Store->getBasePtr(); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return SDValue(); +} + +// This is a shortcut for integer division because we have fast i32<->f32 +// conversions, and fast f32 reciprocal instructions. The fractional part of a +// float is enough to accurately represent up to a 24-bit integer. +SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT IntVT = MVT::i32; + MVT FltVT = MVT::f32; + + ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + + if (VT.isVector()) { + unsigned NElts = VT.getVectorNumElements(); + IntVT = MVT::getVectorVT(MVT::i32, NElts); + FltVT = MVT::getVectorVT(MVT::f32, NElts); + } + + unsigned BitSize = VT.getScalarType().getSizeInBits(); + + SDValue jq = DAG.getConstant(1, DL, IntVT); + + if (sign) { + // char|short jq = ia ^ ib; + jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, VT, jq, + DAG.getConstant(BitSize - 2, DL, VT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, IntVT); + } + + // int ia = (int)LHS; + SDValue ia = sign ? 
+ DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); + + // int ib, (int)RHS; + SDValue ib = sign ? + DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + + // TODO: Should this propagate fast-math-flags? + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, + DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + // int cv = fr >= fb; + SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); + + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); + + // dst = trunc/extend to legal type + iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); + + // dst = iq + jq; + SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); + + // Rem needs compensation, it's easier to recompute it + SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); + Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const { + assert(Op.getValueType() == MVT::i64); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, DL, HalfVT); + SDValue zero = DAG.getConstant(0, DL, HalfVT); + + //HiLo split + SDValue LHS = Op.getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = Op.getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + 
const unsigned bitPos = halfBitWidth - i - 1;
+ SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
+ // Get value of high bit
+ SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
+
+ // Shift
+ REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
+ // Add LHS high bit
+ REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
+
+ SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
+
+ DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+ // Update REM
+ SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+ REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
+ }
+
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+}
+
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::i64) {
+ SmallVector<SDValue, 2> Results;
+ LowerUDIVREM64(Op, DAG, Results);
+ return DAG.getMergeValues(Results, DL);
+ }
+
+ SDValue Num = Op.getOperand(0);
+ SDValue Den = Op.getOperand(1);
+
+ if (VT == MVT::i32) {
+ if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
+ DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
+ // TODO: We technically could do this for i64, but shouldn't that just be
+ // handled by something generally reducing 64-bit division on 32-bit
+ // values to 32-bit?
+ return LowerDIVREM24(Op, DAG, false);
+ }
+ }
+
+ // RCP = URECIP(Den) = 2^32 / Den + e
+ // e is rounding error.
+ SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+ // RCP_LO = mul(RCP, Den)
+ SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
+
+ // RCP_HI = mulhu(RCP, Den)
+ SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+ // NEG_RCP_LO = -RCP_LO
+ SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ RCP_LO);
+
+ // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+ SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
+ NEG_RCP_LO, RCP_LO,
+ ISD::SETEQ);
+ // Calculate the rounding error from the URECIP instruction
+ // E = mulhu(ABS_RCP_LO, RCP)
+ SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+ // RCP_A_E = RCP + E
+ SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+ // RCP_S_E = RCP - E
+ SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+ // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+ SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
+ RCP_A_E, RCP_S_E,
+ ISD::SETEQ);
+ // Quotient = mulhu(Tmp0, Num)
+ SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+ // Num_S_Remainder = Quotient * Den
+ SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
+
+ // Remainder = Num - Num_S_Remainder
+ SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+ SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+ DAG.getConstant(-1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ ISD::SETUGE);
+ // Remainder_GE_Zero = (Num >= Num_S_Remainder ?
-1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, + Num_S_Remainder, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue NegOne = DAG.getConstant(-1, DL, VT); + + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); + } + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return 
DAG.getMergeValues(Res, DL);
+}
+
+// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ // TODO: Should this propagate fast-math-flags?
+
+ SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
+ SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
+
+ return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
+}
+
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ // result = trunc(src)
+ // if (src > 0.0 && src != result)
+ // result += 1.0
+
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+
+ SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+ SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+ SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+ SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+ // TODO: Should this propagate fast-math-flags?
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+
+ SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ Hi,
+ DAG.getConstant(FractBits - 32, SL, MVT::i32),
+ DAG.getConstant(ExpBits, SL, MVT::i32));
+ SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+ DAG.getConstant(1023, SL, MVT::i32));
+
+ return Exp;
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ assert(Op.getValueType() == MVT::f64);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ // Extract the upper half, since this is where we will find the sign and
+ // exponent.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+ const unsigned FractBits = 52;
+
+ // Extract the sign bit.
+ const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
+ SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+ // Extend back to 64-bits.
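Taken together, the nodes assembled in LowerFTRUNC (including the sign/zero selection that continues below) compute trunc for f64 entirely with integer bit operations. A rough scalar model of the same steps, for reference only; truncViaBits is an illustrative name, not part of the LLVM sources:

#include <cstdint>
#include <cstring>

// Scalar model of the f64 trunc lowering: exp < 0 means |x| < 1.0, so only the
// sign survives; 0 <= exp <= 51 clears the (52 - exp) fraction bits below the
// binary point; exp > 51 (including NaN/Inf) leaves the value untouched.
static double truncViaBits(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));

  const int exp = static_cast<int>((bits >> 52) & 0x7ff) - 1023; // unbiased exponent
  const uint64_t signBit = bits & (UINT64_C(1) << 63);

  if (exp < 0) {
    bits = signBit;                                       // +/-0.0 with the original sign
  } else if (exp <= 51) {
    const uint64_t fractMask = (UINT64_C(1) << (52 - exp)) - 1;
    bits &= ~fractMask;                                   // drop the sub-integer bits
  }

  double r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}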
+ SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + Zero, SignBit); + SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); + + SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); + const SDValue FractMask + = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); + + SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); + SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + + const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + + SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); + + return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); + SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + + // TODO: Should this propagate fast-math-flags? + + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); + SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); + + SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); + + APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); + SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); + + return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { + // FNEARBYINT and FRINT are the same, except in their handling of FP + // exceptions. Those aren't really meaningful for us, and OpenCL only has + // rint, so just treat them as equivalent. + return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); +} + +// XXX - May require not supporting f32 denormals? +SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + // TODO: Should this propagate fast-math-flags? 
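The rest of LowerFROUND32, continuing below, turns that trunc into round-half-away-from-zero. A minimal scalar sketch of the intended math; roundFromTrunc is an illustrative name:

#include <cmath>

// Scalar model of the f32 round lowering: add copysign(1.0, x) to trunc(x)
// whenever the discarded fractional part is at least one half.
static float roundFromTrunc(float x) {
  float t = std::trunc(x);                      // FTRUNC
  float adj = (std::fabs(x - t) >= 0.5f)        // FSUB, FABS, SETOGE against 0.5
                  ? std::copysign(1.0f, x)      // FCOPYSIGN(1.0, x)
                  : 0.0f;
  return t + adj;                               // final FADD
}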
+ + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, + MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), SL, + MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, SL, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, SL, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, SL, MVT::f64), + DAG.getConstantFP(0.0, SL, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + +SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src); + // if (src < 0.0 && src != result) + // result += -1.0. 
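The three comment lines above are the entire algorithm; for reference, a scalar sketch under the same reading (floorFromTrunc is an illustrative name, not part of the file):

#include <cmath>

// Scalar model of the f64 floor lowering: subtract 1.0 from trunc(x) only for
// negative, non-integral inputs. NaNs pass through unchanged because both
// ordered compares evaluate to false.
static double floorFromTrunc(double x) {
  double r = std::trunc(x);      // FTRUNC
  if (x < 0.0 && x != r)         // SETOLT and SETONE, combined with AND
    r += -1.0;                   // the selected -1.0 / 0.0 addend, applied with FADD
  return r;
}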
+ + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + // TODO: Should this propagate fast-math-flags? + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::i32); + + SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + + SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); + SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + + const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); + SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); + + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + + if (!ZeroUndef) { + // Test if the full 64-bit input is zero. + + // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, + // which we probably don't want. + SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + + // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction + // with the same cycles, otherwise it is slower. + // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, + // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); + + const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); + + // The instruction returns -1 for 0 input, but the defined intrinsic + // behavior is to return the number of bits. + NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewCtlz); + } + + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + // Unsigned + // cul2f(ulong u) + //{ + // uint lz = clz(u); + // uint e = (u != 0) ? 127U + 63U - lz : 0; + // u = (u << lz) & 0x7fffffffffffffffUL; + // ulong t = u & 0xffffffffffUL; + // uint v = (e << 23) | (uint)(u >> 40); + // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); + // return as_float(v + r); + //} + // Signed + // cl2f(long l) + //{ + // long s = l >> 63; + // float r = cul2f((l + s) ^ s); + // return s ? 
-r : r; + //} + + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + SDValue L = Src; + + SDValue S; + if (Signed) { + const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); + S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); + + SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); + L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::f32); + + + SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); + SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); + SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); + LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); + + SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); + SDValue E = DAG.getSelect(SL, MVT::i32, + DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), + DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), + ZeroI32); + + SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, + DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), + DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); + + SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, + DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); + + SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, + U, DAG.getConstant(40, SL, MVT::i64)); + + SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, + DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); + + SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); + SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); + SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); + + SDValue R = DAG.getSelect(SL, MVT::i32, + RCmp, + One, + DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); + R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); + R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); + + if (!Signed) + return R; + + SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); + return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(0, SL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(1, SL, MVT::i32)); + + SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, + SL, MVT::f64, Hi); + + SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); + + SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + DAG.getConstant(32, SL, MVT::i32)); + // TODO: Should this propagate fast-math-flags? 
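Before the final FADD below, it may help to spell out the arithmetic LowerINT_TO_FP64 performs: convert the two 32-bit halves separately and recombine them as hi * 2^32 + lo, so only the last add rounds. A scalar sketch under that reading; the function names are illustrative:

#include <cmath>
#include <cstdint>

// Scalar model of the u64 -> f64 lowering: LDEXP(uint_to_fp(hi), 32) + uint_to_fp(lo).
static double u64ToF64(uint64_t v) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  return std::ldexp(static_cast<double>(hi), 32) + static_cast<double>(lo);
}

// The signed variant differs only in converting the high half as a signed value.
static double s64ToF64(int64_t v) {
  uint32_t lo = static_cast<uint32_t>(v);
  int32_t hi = static_cast<int32_t>(static_cast<uint64_t>(v) >> 32);
  return std::ldexp(static_cast<double>(hi), 32) + static_cast<double>(lo);
}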
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); +} + +SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, false); + + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, false); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, true); + + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + + SDValue Src = Op.getOperand(0); + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, + MVT::f64); + SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, + MVT::f64); + // TODO: Should this propagate fast-math-flags? + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + + + SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + + SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, + MVT::i32, FloorMul); + SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); + + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, false); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { + EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + MVT VT = Op.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); + + if (!VT.isVector()) + return SDValue(); + + SDValue Src = Op.getOperand(0); + SDLoc DL(Op); + + // TODO: Don't scalarize on Evergreen? 
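Looking back at LowerFP64_TO_INT above: the two constants are the bit patterns of 2^-32 and -2^32, so the sequence splits trunc(x) into a high 32-bit word and a low word recovered exactly by the FMA. A scalar sketch of the unsigned case, valid for in-range, non-negative inputs; f64ToU64 is an illustrative name, and the signed variant only changes the conversion of the high word:

#include <cmath>
#include <cstdint>

// Scalar model of the f64 -> u64 lowering.
static uint64_t f64ToU64(double x) {
  double t   = std::trunc(x);                // FTRUNC
  double hiF = std::floor(t * 0x1p-32);      // FMUL by 2^-32 (exact), then FFLOOR
  double loF = std::fma(hiF, -0x1p+32, t);   // FMA with -2^32 recovers t - hi * 2^32 exactly
  uint32_t hi = static_cast<uint32_t>(hiF);  // FP_TO_UINT of the high word
  uint32_t lo = static_cast<uint32_t>(loF);  // FP_TO_UINT of the low word
  return (static_cast<uint64_t>(hi) << 32) | lo;  // BUILD_VECTOR of (Lo, Hi) + BITCAST to i64
}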
+ unsigned NElts = VT.getVectorNumElements(); + SmallVector<SDValue, 8> Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); + + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); + + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} + +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} + +template <typename IntTy> +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width, SDLoc DL) { + if (Width + Offset < 32) { + uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); + IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); + return DAG.getConstant(Result, DL, MVT::i32); + } + + return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); +} + +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. Ideally we could +// recognize the pack / unpack pattern to eliminate it. 
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast<StoreSDNode>(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || + !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) + return SDValue(); + + LoadSDNode *LoadVal = cast<LoadSDNode>(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + +SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) + return SDValue(); + + // i64 (shl x, 32) -> (build_pair 0, x) + + // Doing this with moves theoretically helps MI optimizations that understand + // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as + // v_lshl_b64. In the SALU case, I think this is slightly worse since it + // doubles the code size and I'm unsure about cycle count. + const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!RHS || RHS->getZExtValue() != 32) + return SDValue(); + + SDValue LHS = N->getOperand(0); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + + // Extract low 32-bits. + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo); +} + +SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + return SDValue(); + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + return DAG.getSExtOrTrunc(Mul, DL, VT); +} + +static bool isNegativeOne(SDValue Val) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) + return C->isAllOnesValue(); + return false; +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +// Get FFBH node if the incoming op may have been type legalized from a smaller +// type VT. +// Need to match pre-legalized type because the generic legalization inserts the +// add/sub between the select and compare. 
+static SDValue getFFBH_U32(const TargetLowering &TLI, + SelectionDAG &DAG, SDLoc SL, SDValue Op) { + EVT VT = Op.getValueType(); + EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32) + return SDValue(); + + if (VT != MVT::i32) + Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + if (VT != MVT::i32) + FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + + return FFBH; +} + +// The native instructions return -1 on 0 input. Optimize out a select that +// produces -1 on 0. +// +// TODO: If zero is not undef, we could also do this if the output is compared +// against the bitwidth. +// +// TODO: Should probably combine against FFBH_U32 instead of ctlz directly. +SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, + SDValue Cond, + SDValue LHS, + SDValue RHS, + DAGCombinerInfo &DCI) const { + ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (!CmpRhs || !CmpRhs->isNullValue()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue CmpLHS = Cond.getOperand(0); + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + if (CCOpcode == ISD::SETEQ && + isCtlzOpc(RHS.getOpcode()) && + RHS.getOperand(0) == CmpLHS && + isNegativeOne(LHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + if (CCOpcode == ISD::SETNE && + isCtlzOpc(LHS.getOpcode()) && + LHS.getOperand(0) == CmpLHS && + isNegativeOne(RHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 && Cond.hasOneUse()) + return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + + // There's no reason to not do this if the condition has other uses. 
+ return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: + break; + case ISD::SHL: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performShlCombine(N, DCI); + } + case ISD::MUL: + return performMulCombine(N, DCI); + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT: + return performSelectCombine(N, DCI); + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, DL, MVT::i32); + + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { + if (Signed) { + return constantFoldBFE<int32_t>(DAG, + CVal->getSExtValue(), + OffsetVal, + WidthVal, + DL); + } + + return constantFoldBFE<uint32_t>(DAG, + CVal->getZExtValue(), + OffsetVal, + WidthVal, + DL); + } + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); + return DAG.getNode(Signed ? 
ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + if (BitsFrom.hasOneUse()) { + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, + KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + } + + break; + } + + case ISD::STORE: + return performStoreCombine(N, DCI); + } + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +void AMDGPUTargetLowering::getOriginalFunctionArgs( + SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<ISD::InputArg> &OrigIns) const { + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + if (Ins[i].ArgVT == Ins[i].VT) { + OrigIns.push_back(Ins[i]); + continue; + } + + EVT VT; + if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { + // Vector has been split into scalars. + VT = Ins[i].ArgVT.getVectorElementType(); + } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && + Ins[i].ArgVT.getVectorElementType() != + Ins[i].VT.getVectorElementType()) { + // Vector elements have been promoted + VT = Ins[i].ArgVT; + } else { + // Vector has been spilt into smaller vectors. + VT = Ins[i].VT; + } + + ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, + Ins[i].OrigArgIndex, Ins[i].PartOffset); + OrigIns.push_back(Arg); + } +} + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->isExactlyValue(1.0); + } + return isAllOnesConstant(Op); +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->getValueAPF().isZero(); + } + return isNullConstant(Op); +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( + const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { + uint64_t ArgOffset = MFI->ABIArgOffset; + switch (Param) { + case GRID_DIM: + return ArgOffset; + case GRID_OFFSET: + return ArgOffset + 4; + } + llvm_unreachable("unexpected implicit parameter type"); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((AMDGPUISD::NodeType)Opcode) { + case AMDGPUISD::FIRST_NUMBER: break; + // AMDIL DAG nodes + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) + NODE_NAME_CASE(COS_HW) + NODE_NAME_CASE(SIN_HW) + NODE_NAME_CASE(FMAX_LEGACY) + NODE_NAME_CASE(FMIN_LEGACY) + 
NODE_NAME_CASE(FMAX3)
+ NODE_NAME_CASE(SMAX3)
+ NODE_NAME_CASE(UMAX3)
+ NODE_NAME_CASE(FMIN3)
+ NODE_NAME_CASE(SMIN3)
+ NODE_NAME_CASE(UMIN3)
+ NODE_NAME_CASE(URECIP)
+ NODE_NAME_CASE(DIV_SCALE)
+ NODE_NAME_CASE(DIV_FMAS)
+ NODE_NAME_CASE(DIV_FIXUP)
+ NODE_NAME_CASE(TRIG_PREOP)
+ NODE_NAME_CASE(RCP)
+ NODE_NAME_CASE(RSQ)
+ NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(LDEXP)
+ NODE_NAME_CASE(FP_CLASS)
+ NODE_NAME_CASE(DOT4)
+ NODE_NAME_CASE(CARRY)
+ NODE_NAME_CASE(BORROW)
+ NODE_NAME_CASE(BFE_U32)
+ NODE_NAME_CASE(BFE_I32)
+ NODE_NAME_CASE(BFI)
+ NODE_NAME_CASE(BFM)
+ NODE_NAME_CASE(FFBH_U32)
+ NODE_NAME_CASE(MUL_U24)
+ NODE_NAME_CASE(MUL_I24)
+ NODE_NAME_CASE(MAD_U24)
+ NODE_NAME_CASE(MAD_I24)
+ NODE_NAME_CASE(TEXTURE_FETCH)
+ NODE_NAME_CASE(EXPORT)
+ NODE_NAME_CASE(CONST_ADDRESS)
+ NODE_NAME_CASE(REGISTER_LOAD)
+ NODE_NAME_CASE(REGISTER_STORE)
+ NODE_NAME_CASE(LOAD_CONSTANT)
+ NODE_NAME_CASE(LOAD_INPUT)
+ NODE_NAME_CASE(SAMPLE)
+ NODE_NAME_CASE(SAMPLEB)
+ NODE_NAME_CASE(SAMPLED)
+ NODE_NAME_CASE(SAMPLEL)
+ NODE_NAME_CASE(CVT_F32_UBYTE0)
+ NODE_NAME_CASE(CVT_F32_UBYTE1)
+ NODE_NAME_CASE(CVT_F32_UBYTE2)
+ NODE_NAME_CASE(CVT_F32_UBYTE3)
+ NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+ NODE_NAME_CASE(CONST_DATA_PTR)
+ case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
+ NODE_NAME_CASE(SENDMSG)
+ NODE_NAME_CASE(INTERP_MOV)
+ NODE_NAME_CASE(INTERP_P1)
+ NODE_NAME_CASE(INTERP_P2)
+ NODE_NAME_CASE(STORE_MSKOR)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
+ }
+ return nullptr;
+}
+
+SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rsq instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ // Reciprocal, < 1 ulp error.
+ //
+ // This reciprocal approximation converges to < 0.5 ulp error with one
+ // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
+
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rcp instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
+static void computeKnownBitsForMinMax(const SDValue Op0,
+ const SDValue Op1,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) {
+ APInt Op0Zero, Op0One;
+ APInt Op1Zero, Op1One;
+ DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
+ DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
+
+ KnownZero = Op0Zero & Op1Zero;
+ KnownOne = Op0One & Op1One;
+}
+
+void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+
+ KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+
+ APInt KnownZero2;
+ APInt KnownOne2;
+ unsigned Opc = Op.getOpcode();
+
+ switch (Opc) {
+ default:
+ break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ // FIXME: The intrinsic should just use the node.
+ switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + case AMDGPUIntrinsic::AMDGPU_imax: + case AMDGPUIntrinsic::AMDGPU_umax: + case AMDGPUIntrinsic::AMDGPU_imin: + case AMDGPUIntrinsic::AMDGPU_umin: + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); + break; + default: + break; + } + + break; + } + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: { + KnownZero = APInt::getHighBitsSet(32, 31); + break; + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + + if (Opc == AMDGPUISD::BFE_U32) + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + + break; + } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + if (!isNullConstant(Op.getOperand(1))) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; + } + + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: + return 31; + + default: + return 1; + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h new file mode 100644 index 0000000..3792541 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -0,0 +1,322 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUMachineFunction; +class AMDGPUSubtarget; +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +protected: + const AMDGPUSubtarget *Subtarget; + +private: + SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + /// \brief Lower vector stores by merging the vector elements into an integer + /// of the same bitwidth. + SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; + /// \brief Split a vector store into multiple scalar stores. + /// \returns The resulting chain. 
+ + SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, + DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); + + virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; + + /// \brief Split a vector load into a scalar load of each component. + SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into 2 loads of half the vector. + SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into a scalar store of each component. + SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into 2 stores of half the vector. + SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const; + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + /// The SelectionDAGBuilder will automatically promote function arguments + /// with illegal types. However, this does not work for the AMDGPU targets + /// since the function arguments are stored in memory as these illegal types. 
+ /// In order to handle this properly we need to get the original type sizes
+ /// from the LLVM IR Function and fix up the ISD::InputArg values before
+ /// passing them to AnalyzeFormalArguments().
+ void getOriginalFunctionArgs(SelectionDAG &DAG,
+ const Function *F,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<ISD::InputArg> &OrigIns) const;
+ void AnalyzeFormalArguments(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+ void AnalyzeReturn(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) const;
+
+public:
+ AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
+
+ bool isFAbsFree(EVT VT) const override;
+ bool isFNegFree(EVT VT) const override;
+ bool isTruncateFree(EVT Src, EVT Dest) const override;
+ bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+ bool isZExtFree(Type *Src, Type *Dest) const override;
+ bool isZExtFree(EVT Src, EVT Dest) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+ MVT getVectorIdxTy(const DataLayout &) const override;
+ bool isSelectSupported(SelectSupportKind) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool ShouldShrinkFPConstant(EVT VT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtType,
+ EVT ExtVT) const override;
+
+ bool isLoadBitCastBeneficial(EVT, EVT) const override;
+
+ bool storeOfVectorConstantIsCheap(EVT MemVT,
+ unsigned NumElem,
+ unsigned AS) const override;
+ bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue CombineFMinMaxLegacy(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const;
+
+ const char* getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
+ SDValue getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const override;
+
+ virtual SDNode *PostISelFolding(MachineSDNode *N,
+ SelectionDAG &DAG) const {
+ return N;
+ }
+
+ /// \brief Determine which of the bits specified in \p Mask are known to be
+ /// either zero or one and return them in the \p KnownZero and \p KnownOne
+ /// bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. + virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; + + enum ImplicitParameter { + GRID_DIM, + GRID_OFFSET + }; + + /// \brief Helper function that returns the byte offset of the given + /// type of implicit parameter. + uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + const ImplicitParameter Param) const; +}; + +namespace AMDGPUISD { + +enum NodeType : unsigned { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + DWORDADDR, + FRACT, + CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. + COS_HW, + SIN_HW, + FMAX_LEGACY, + FMIN_LEGACY, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, + URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, + LDEXP, + FP_CLASS, + DOT4, + CARRY, + BORROW, + BFE_U32, // Extract range of bits with zero extension to 32-bits. + BFE_I32, // Extract range of bits with sign extension to 32-bits. + BFI, // (src0 & src1) | (~src0 & src2) + BFM, // Insert a range of bits into a 32-bit word. + FFBH_U32, // ctlz with -1 if input is zero. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, + TEXTURE_FETCH, + EXPORT, + CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, + LOAD_INPUT, + SAMPLE, + SAMPLEB, + SAMPLED, + SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + /// Pointer to the start of the shader's constant data. + CONST_DATA_PTR, + SENDMSG, + INTERP_MOV, + INTERP_P1, + INTERP_P2, + FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + STORE_MSKOR, + LOAD_CONSTANT, + TBUFFER_STORE_FORMAT, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp new file mode 100644 index 0000000..a266e71 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -0,0 +1,377 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRINFO_NAMED_OPS +#define GET_INSTRMAP_INFO +#include "AMDGPUGenInstrInfo.inc" + +// Pin the vtable to this file. +void AMDGPUInstrInfo::anchor() {} + +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1, -1), ST(st) {} + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return nullptr; +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + llvm_unreachable("Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + llvm_unreachable("Not Implemented"); +} + +bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { + MachineBasicBlock *MBB = MI->getParent(); + int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::addr); + // addr is a custom operand with multiple MI operands, and only the + // first MI operand is given a name. 
+ int RegOpIdx = OffsetOpIdx + 1; + int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::chan); + if (isRegisterLoad(*MI)) { + int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::dst); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + getIndirectAddrRegClass()->getRegister(Address)); + } else { + buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + Address, OffsetReg); + } + } else if (isRegisterStore(*MI)) { + int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::val); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), + MI->getOperand(ValOpIdx).getReg()); + } else { + buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), + calculateIndirectAddress(RegIndex, Channel), + OffsetReg); + } + } else { + return false; + } + + MBB->erase(MI); + return true; +} + +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { +// TODO: Implement this function + return nullptr; +} +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + // TODO: Implement this function + return nullptr; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::enableClusterLoads() const { + return true; +} + +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. 
+ + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} + +int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = -1; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + unsigned Reg = LI->first; + if (TargetRegisterInfo::isVirtualRegister(Reg) || + !IndirectRC->contains(Reg)) + continue; + + unsigned RegIndex; + unsigned RegEnd; + for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; + ++RegIndex) { + if (IndirectRC->getRegister(RegIndex) == Reg) + break; + } + Offset = std::max(Offset, (int)RegIndex); + } + + return Offset + 1; +} + +int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + if (MFI->getNumObjects() == 0) { + return -1; + } + + unsigned IgnoredFrameReg; + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( + MF, -1, IgnoredFrameReg); + + return getIndirectIndexBegin(MF) + Offset; +} + +int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { + switch (Channels) { + default: return Opcode; + case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); + case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); + case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); + } +} + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. 
+namespace llvm { +namespace AMDGPU { +static int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); +} +} +} + +// This must be kept in sync with the SISubtarget class in SIInstrInfo.td +enum SISubtarget { + SI = 0, + VI = 1 +}; + +static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { + switch (Gen) { + default: + return SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + return VI; + } +} + +int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { + int MCOp = AMDGPU::getMCOpcode( + Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} + +ArrayRef<std::pair<int, const char *>> +AMDGPUInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h new file mode 100644 index 0000000..53e8b23 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -0,0 +1,208 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. 
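pseudoToMCOpcode above relies on two different sentinels coming back from the generated mapping table: plain -1 for "not a pseudo" and (uint16_t)-1 for "no encoding on this subtarget generation". A toy, self-contained sketch of that convention, using a hypothetical stand-in table rather than the real TableGen output:

#include <cstdint>
#include <cstdio>

enum ToySubtarget { TOY_SI = 0, TOY_VI = 1 };

// Tiny stand-in for the generated pseudo->MC mapping:
//   -1     : opcode is not a pseudo (already a native instruction)
//   0xffff : pseudo exists, but has no encoding on this subtarget
static int toyGetMCOpcode(uint16_t Opcode, ToySubtarget Gen) {
  static const uint16_t Table[][2] = {
    /* pseudo 0 */ {100, 200},    // encodes on both generations
    /* pseudo 1 */ {101, 0xffff}, // first generation only
  };
  if (Opcode >= 2)
    return -1;                    // not a pseudo at all
  return Table[Opcode][Gen];
}

// Mirrors the decision logic: return the native opcode, the MC opcode,
// or -1 when the pseudo cannot be encoded on this generation.
static int resolve(uint16_t Opcode, ToySubtarget Gen) {
  int MCOp = toyGetMCOpcode(Opcode, Gen);
  if (MCOp == -1)
    return Opcode;                // already native
  if (MCOp == (uint16_t)-1)
    return -1;                    // no encoding for this generation
  return MCOp;
}

int main() {
  std::printf("%d %d %d\n",
              resolve(0, TOY_VI),   // 200
              resolve(1, TOY_VI),   // -1: no encoding on this generation
              resolve(7, TOY_SI));  // 7: not a pseudo, returned unchanged
}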
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H + +#include "AMDGPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include <map> + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUSubtarget; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + virtual void anchor(); +protected: + const AMDGPUSubtarget &ST; +public: + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const override; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const override; + + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + MachineInstr *LoadMI) const override; + +public: + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexBegin(const MachineFunction &MF) const; + + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. 
+ int getIndirectIndexEnd(const MachineFunction &MF) const; + + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const override; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode *> &NewNodes) const override; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; + + bool enableClusterLoads() const override; + + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + // Helper functions that check the opcode for status information + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; + + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + + /// \brief Return the descriptor of the target-specific machine instruction + /// that corresponds to the specified pseudo or native opcode. + const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { + return get(pseudoToMCOpcode(Opcode)); + } + + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + +//===---------------------------------------------------------------------===// +// Pure virtual funtions to be implemented by sub-classes. +//===---------------------------------------------------------------------===// + + virtual bool isMov(unsigned opcode) const = 0; + + /// \brief Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accesed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const = 0; + + /// \returns The register class to be used for loading and storing values + /// from an "Indirect Address" . + virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; + + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build instruction(s) for an indirect register read. 
+ /// + /// \returns The instruction that performs the indirect register read + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build a MOV instruction. + virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const = 0; + + /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the + /// equivalent opcode that writes \p Channels Channels. + int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; + +}; + +namespace AMDGPU { + LLVM_READONLY + int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); +} // End namespace AMDGPU + +} // End llvm namespace + +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td new file mode 100644 index 0000000..575dfe4 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -0,0 +1,245 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node defintions for the AMDGPU target. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPULdExpOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUFPClassOp : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + +// float, float, float, vcc +def AMDGPUFmasOp : SDTypeProfile<1, 4, + [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] +>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// This argument to this node is a dword address. +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) result clamped to +/- max_float. 
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; + +def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; + +def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; + +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [] +>; + +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats, where a nan comparison fails. +def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [] +>; + +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. +// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 +def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; + +// out = (src1 > src0) ? 1 : 0 +def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; + + +def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", + SDTIntToFPOp, []>; + + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. +// out = (2^32 / a) + e +// e is rounding error +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; + +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; + +// Single or double precision division fixup. +// Special case divide fixup and flags(src0 = Quotient, src1 = +// Denominator, src2 = Numerator). 
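The FMAX_LEGACY/FMIN_LEGACY and CARRY/BORROW comments above translate directly into scalar code. A host-side sketch of that reading, for illustration only (the NaN behavior shown is my reading of "gives the second operand"):

#include <cstdint>
#include <cmath>
#include <cstdio>

// FMAX_LEGACY / FMIN_LEGACY as ordinary compare-and-select: any NaN makes
// the comparison fail, so the second operand is returned, which is why the
// nodes are not commutative.
static float fmax_legacy(float A, float B) { return (A > B) ? A : B; }
static float fmin_legacy(float A, float B) { return (A < B) ? A : B; }

// CARRY:  (src0 + src1 > 0xFFFFFFFF) ? 1 : 0
static uint32_t carry(uint32_t A, uint32_t B) {
  return ((uint64_t)A + B > 0xffffffffu) ? 1u : 0u;
}

// BORROW: (src1 > src0) ? 1 : 0
static uint32_t borrow(uint32_t A, uint32_t B) { return (B > A) ? 1u : 0u; }

int main() {
  float NaN = std::nanf("");
  // Non-commutative under NaN: prints "nan" then "1.000000".
  std::printf("%f %f\n", fmax_legacy(1.0f, NaN), fmax_legacy(NaN, 1.0f));
  std::printf("%f\n", fmin_legacy(NaN, 2.0f));                // 2.000000
  std::printf("%u %u\n", carry(0xffffffffu, 1u), borrow(1u, 2u)); // 1 1
}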
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look Up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + +def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + +// MSKOR instructions are atomic memory instructions used mainly for storing +// 8-bit and 16-bit values. The definition is: +// +// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) +// +// src0: vec4(src, 0, 0, mask) +// src1: dst - rat offset (aka pointer) in dwords +def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", + SDTypeProfile<0, 2, []>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def AMDGPUround : SDNode<"ISD::FROUND", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; + +def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; +def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; + +def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; + +// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when +// performing the mulitply. The result is a 32-bit value. +def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", + SDTypeProfile<1, 4, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td new file mode 100644 index 0000000..2a7ce6a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -0,0 +1,647 @@ +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// +// 
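A host-side sketch of the MSKOR formula and the 24-bit multiply semantics described above, for illustration only (the byte-store helper is a hypothetical use of MSKOR, not the backend's lowering):

#include <cstdint>
#include <cstdio>

// MSKOR(dst, mask, src): MEM[dst] = (MEM[dst] & ~mask) | src.
// 'Mem' stands in for the 32-bit memory word addressed by dst.
static void mskor(uint32_t &Mem, uint32_t Mask, uint32_t Src) {
  Mem = (Mem & ~Mask) | Src;
}

// A truncating i8 store into byte 'ByteIdx' of a dword, expressed as MSKOR.
static void store_byte(uint32_t &Mem, unsigned ByteIdx, uint8_t Val) {
  uint32_t Mask = 0xffu << (8 * ByteIdx);
  mskor(Mem, Mask, (uint32_t)Val << (8 * ByteIdx));
}

// MUL_U24 / MUL_I24: only the low 24 bits of each operand participate in the
// multiply; the result is a full 32-bit value.
static int64_t sext24(uint32_t X) {
  X &= 0xffffff;
  return (X & 0x800000) ? (int64_t)X - 0x1000000 : (int64_t)X;
}
static uint32_t mul_u24(uint32_t A, uint32_t B) {
  return (uint32_t)((uint64_t)(A & 0xffffff) * (B & 0xffffff));
}
static int32_t mul_i24(uint32_t A, uint32_t B) {
  return (int32_t)(sext24(A) * sext24(B));
}

int main() {
  uint32_t Word = 0xdeadbeef;
  store_byte(Word, 1, 0x42);
  std::printf("%08x\n", Word);                      // dead42ef
  std::printf("%u %d\n", mul_u24(0x01000003u, 2u),  // 6 (high byte ignored)
              mul_i24(0x00ffffffu, 2u));            // -2 (0xffffff is -1)
}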
+// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction defs that are common to all hw codegen +// targets. +// +//===----------------------------------------------------------------------===// + +class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction { + field bit isRegisterLoad = 0; + field bit isRegisterStore = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = NullALU; + + let TSFlags{63} = isRegisterLoad; + let TSFlags{62} = isRegisterStore; +} + +class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> + : AMDGPUInst<outs, ins, asm, pattern> { + + field bits<32> Inst = 0xffffffff; + +} + +def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; + +def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; + +let OperandType = "OPERAND_IMMEDIATE" in { + +def u32imm : Operand<i32> { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand<i8> { + let PrintMethod = "printU8ImmOperand"; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand<OtherVT>; + +//===----------------------------------------------------------------------===// +// PatLeafs for floating-point comparisons +//===----------------------------------------------------------------------===// + +def COND_OEQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] +>; + +def COND_ONE : PatLeaf < + (cond), + [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] +>; + +def COND_OGT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] +>; + +def COND_OGE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] +>; + +def COND_OLT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] +>; + +def COND_OLE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] +>; + + +def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; +def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for unsigned / unordered comparisons +//===----------------------------------------------------------------------===// + +def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; +def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; +def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; +def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; +def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; +def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; + +// XXX - For some reason R600 version is preferring to use unordered +// for setne? 
+def COND_UNE_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] +>; + +//===----------------------------------------------------------------------===// +// PatLeafs for signed comparisons +//===----------------------------------------------------------------------===// + +def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; +def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; +def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; +def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for integer equality +//===----------------------------------------------------------------------===// + +def COND_EQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] +>; + +def COND_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] +>; + +def COND_NULL : PatLeaf < + (cond), + [{(void)N; return false;}] +>; + +//===----------------------------------------------------------------------===// +// Load/Store Pattern Fragments +//===----------------------------------------------------------------------===// + +class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; +}]>; + +class PrivateLoad <SDPatternOperator op> : PrivateMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class PrivateStore <SDPatternOperator op> : PrivateMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def load_private : PrivateLoad <load>; + +def truncstorei8_private : PrivateStore <truncstorei8>; +def truncstorei16_private : PrivateStore <truncstorei16>; +def store_private : PrivateStore <store>; + +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ + LoadSDNode *L = cast<LoadSDNode>(N); + return L->getExtensionType() == ISD::ZEXTLOAD || + L->getExtensionType() == ISD::EXTLOAD; +}]>; + +def az_extload : AZExtLoadBase <unindexedload>; + +def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return 
isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def extloadi8_private : PrivateLoad <az_extloadi8>; +def sextloadi8_private : PrivateLoad <sextloadi8>; + +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def extloadi16_private : PrivateLoad <az_extloadi16>; +def sextloadi16_private : PrivateLoad <sextloadi16>; + +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def az_extloadi32_global : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi32_constant : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def local_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; + +class local_binary_atomic_op<SDNode atomic_op> : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + +def atomic_swap_local : local_binary_atomic_op<atomic_swap>; +def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; +def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; +def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>; 
+def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>; +def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>; +def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>; +def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>; +def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>; +def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>; +def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; + +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + +multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { + + def _32_local : PatFrag < + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; + + def _64_local : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; +} + +defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; + +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + +class global_binary_atomic_op<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] +>; + +def atomic_swap_global : global_binary_atomic_op<atomic_swap>; +def atomic_add_global : global_binary_atomic_op<atomic_load_add>; +def atomic_and_global : global_binary_atomic_op<atomic_load_and>; +def atomic_max_global : global_binary_atomic_op<atomic_load_max>; +def atomic_min_global : global_binary_atomic_op<atomic_load_min>; +def atomic_or_global : global_binary_atomic_op<atomic_load_or>; +def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; +def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; +def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; +def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; + +//===----------------------------------------------------------------------===// +// Misc Pattern Fragments +//===----------------------------------------------------------------------===// + +class Constants { +int TWO_PI = 0x40c90fdb; +int PI = 0x40490fdb; +int TWO_PI_INV = 0x3e22f983; +int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; +} +def CONST : Constants; + +def FP_ZERO : PatLeaf < + (fpimm), + [{return N->getValueAPF().isZero();}] +>; + +def FP_ONE : PatLeaf < + (fpimm), + [{return N->isExactlyValue(1.0);}] +>; + +def FP_HALF : PatLeaf < + (fpimm), + [{return N->isExactlyValue(0.5);}] +>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { + +class CLAMP <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] +>; + +class FABS <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set f32:$dst, (fabs f32:$src0))] +>; + +class FNEG 
<RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set f32:$dst, (fneg f32:$src0))] +>; + +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, + ComplexPattern addrPat> { +let UseNamedOperandTable = 1 in { + + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} +} + +} // End isCodeGenOnly = 1, isPseudo = 1 + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> + : Pat < + (fpow f32:$src0, f32:$src1), + (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) +>; + +/* Other helper patterns */ +/* --------------------- */ + +/* Extract element pattern */ +class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, + SubRegIndex sub_reg> + : Pat< + (sub_type (extractelt vec_type:$src, sub_idx)), + (EXTRACT_SUBREG $src, sub_reg) +>; + +/* Insert element pattern */ +class Insert_Element <ValueType elem_type, ValueType vec_type, + int sub_idx, SubRegIndex sub_reg> + : Pat < + (insertelt vec_type:$vec, elem_type:$elem, sub_idx), + (INSERT_SUBREG $vec, $elem, sub_reg) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. +// bitconvert pattern +class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat < + (dt (bitconvert (st rc:$src0))), + (dt rc:$src0) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. 
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < + (vt (AMDGPUdwordaddr (vt rc:$addr))), + (vt rc:$addr) +>; + +// BFI_INT patterns + +multiclass BFIPatterns <Instruction BFI_INT, + Instruction LoadImm32, + RegisterClass RC64> { + // Definition from ISA doc: + // (y & x) | (z & ~x) + def : Pat < + (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), + (BFI_INT $x, $y, $z) + >; + + // SHA-256 Ch function + // z ^ (x & (y ^ z)) + def : Pat < + (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), + (BFI_INT $x, $y, $z) + >; + + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; +} + +// SHA-256 Ma patterns + +// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y +class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) +>; + +// Bitfield extract patterns + +def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ + return isMask_32(N->getZExtValue()); +}]>; + +def IMMPopCount : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +class BFEPattern <Instruction BFE, Instruction MOV> : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) +>; + +// rotr pattern +class ROTRPattern <Instruction BIT_ALIGN> : Pat < + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +>; + +// 24-bit arithmetic patterns +def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; + +// Special conversion patterns + +def cvt_rpi_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor (fadd $src, FP_HALF))), + [{ (void) N; return TM.Options.NoNaNsFPMath; }] +>; + +def cvt_flr_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor $src)), + [{ (void)N; return TM.Options.NoNaNsFPMath; }] +>; + +/* +class UMUL24Pattern <Instruction UMUL24> : Pat < + (mul U24:$x, U24:$y), + (UMUL24 $x, $y) +>; +*/ + +class IMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +class RcpPat<Instruction RcpInst, ValueType vt> : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat<Instruction RsqInst, ValueType vt> : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + 
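The BFI, SHA-256 Ch/Ma, copysign, and BFE rewrites above are pure bit identities, so they can be checked with a small standalone program (illustration only, not part of the backend):

#include <cstdint>
#include <cassert>
#include <cstdio>

// BFI_INT reference: (src0 & src1) | (~src0 & src2).
static uint32_t bfi(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & Y) | (~X & Z);
}

// BFE_U32 reference: extract 'Width' bits starting at 'Off', zero-extended.
static uint32_t bfe_u32(uint32_t Src, uint32_t Off, uint32_t Width) {
  if (Width == 0)
    return 0;
  return (Src >> Off) & (Width < 32 ? (1u << Width) - 1u : 0xffffffffu);
}

static unsigned popcount(uint32_t X) {
  unsigned N = 0;
  for (; X; X &= X - 1)
    ++N;
  return N;
}

int main() {
  const uint32_t X = 0x12345678, Y = 0x9abcdef0, Z = 0x0f0f0f0f;

  // ISA definition used by BFIPatterns: (y & x) | (z & ~x) == BFI(x, y, z).
  assert(((Y & X) | (Z & ~X)) == bfi(X, Y, Z));

  // SHA-256 Ch:  z ^ (x & (y ^ z))       == BFI(x, y, z).
  assert((Z ^ (X & (Y ^ Z))) == bfi(X, Y, Z));

  // SHA-256 Ma:  (x & z) | (y & (x | z)) == BFI(x ^ y, z, y).
  assert(((X & Z) | (Y & (X | Z))) == bfi(X ^ Y, Z, Y));

  // f32 copysign: magnitude from src0, sign bit from src1, via a 0x7fffffff
  // mask in src0 of BFI.
  assert(bfi(0x7fffffffu, X, Y) == ((X & 0x7fffffffu) | (Y & 0x80000000u)));

  // BFEPattern: (src >> rshift) & mask, with mask a zero-based bit mask,
  // is BFE(src, rshift, popcount(mask)).
  const uint32_t Mask = 0x3f; // six contiguous low bits
  assert(((X >> 5) & Mask) == bfe_u32(X, 5, popcount(Mask)));

  std::printf("all identities hold\n");
}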
+include "SIInstrInfo.td" + diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp new file mode 100644 index 0000000..e94bb60 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -0,0 +1,77 @@ +//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDGPUGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() + : TargetIntrinsicInfo() {} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + if (IntrID < Intrinsic::num_intrinsics) { + return nullptr; + } + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { + if (!StringRef(Name, Len).startswith("llvm.")) + return 0; // All intrinsics start with 'llvm.' + +#define GET_FUNCTION_RECOGNIZER +#include "AMDGPUGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); + + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { + llvm_unreachable("Not implemented"); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h new file mode 100644 index 0000000..4c95b5e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -0,0 +1,48 @@ +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
+// +//===-----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace AMDGPUIntrinsic { +enum ID { + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDGPU_intrinsics +}; + +} // end namespace AMDGPUIntrinsic + +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +public: + AMDGPUIntrinsicInfo(); + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned numTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, + Type **Tys = nullptr, + unsigned numTys = 0) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td new file mode 100644 index 0000000..1de3546 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -0,0 +1,90 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. 
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], 
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; +} + +// Legacy names for compatibility. +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; +} + +include "SIIntrinsics.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp new file mode 100644 index 0000000..dfc652f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -0,0 +1,144 @@ +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include <algorithm> + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + + if (MCOpcode == -1) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " + "a target-specific version: " + Twine(MI->getOpcode())); + } + + OutMI.setOpcode(MCOpcode); + + for (const MachineOperand &MO : MI->explicit_operands()) { + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( + MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); + break; + } + case MachineOperand::MO_ExternalSymbol: { + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI); + +#ifdef _DEBUG + StringRef Err; + if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { + errs() << "Warning: Illegal instruction detected: " << Err << "\n"; + MI->dump(); + } +#endif + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + + if (STI.dumpCode()) { + // Disassemble instruction/operands to text. + DisasmLines.resize(DisasmLines.size() + 1); + std::string &DisasmLine = DisasmLines.back(); + raw_string_ostream DisasmStream(DisasmLine); + + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), + MF->getSubtarget()); + + // Disassemble instruction/operands to hex representation. 
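+      // Each 32-bit encoding word is printed as one eight-digit hex group.
+      // For example, the byte sequence 7E 00 02 BE comes out as "BE02007E" on
+      // a little-endian host, since four bytes at a time are reinterpreted in
+      // host byte order and formatted with "%08X"; groups after the first are
+      // separated by a single space.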
+ SmallVector<MCFixup, 4> Fixups; + SmallVector<char, 16> CodeBytes; + raw_svector_ostream CodeStream(CodeBytes); + + auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer); + MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); + InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, + MF->getSubtarget<MCSubtargetInfo>()); + HexLines.resize(HexLines.size() + 1); + std::string &HexLine = HexLines.back(); + raw_string_ostream HexStream(HexLine); + + for (size_t i = 0; i < CodeBytes.size(); i += 4) { + unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i]; + HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord); + } + + DisasmStream.flush(); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size()); + } + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h new file mode 100644 index 0000000..d322fe0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -0,0 +1,35 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H + +namespace llvm { + +class AMDGPUSubtarget; +class MachineInstr; +class MCContext; +class MCInst; + +class AMDGPUMCInstLower { + MCContext &Ctx; + const AMDGPUSubtarget &ST; + +public: + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp new file mode 100644 index 0000000..5413717 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -0,0 +1,20 @@ +#include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +using namespace llvm; + +// Pin the vtable to this file. +void AMDGPUMachineFunction::anchor() {} + +AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ABIArgOffset(0), + ScratchSize(0), + IsKernel(true) { + + ShaderType = AMDGPU::getShaderType(*MF.getFunction()); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h new file mode 100644 index 0000000..46fcee8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -0,0 +1,50 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H + +#include "llvm/CodeGen/MachineFunction.h" +#include <map> + +namespace llvm { + +class AMDGPUMachineFunction : public MachineFunctionInfo { + virtual void anchor(); + unsigned ShaderType; + +public: + AMDGPUMachineFunction(const MachineFunction &MF); + /// A map to keep track of local memory objects and their offsets within + /// the local memory space. + std::map<const GlobalValue *, unsigned> LocalMemoryObjects; + /// Number of bytes in the LDS that are being used. + unsigned LDSSize; + + /// Start of implicit kernel args + unsigned ABIArgOffset; + + unsigned getShaderType() const { + return ShaderType; + } + + bool isKernel() const { + // FIXME: Assume everything is a kernel until function calls are supported. + return true; + } + + unsigned ScratchSize; + bool IsKernel; +}; + +} +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp new file mode 100644 index 0000000..554bf1d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -0,0 +1,373 @@ +//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass resolves calls to OpenCL image attribute, image resource ID and +/// sampler resource ID getter functions. +/// +/// Image attributes (size and format) are expected to be passed to the kernel +/// as kernel arguments immediately following the image argument itself, +/// therefore this pass adds image size and format arguments to the kernel +/// functions in the module. The kernel functions with image arguments are +/// re-created using the new signature. The new arguments are added to the +/// kernel metadata with kernel_arg_type set to "image_size" or "image_format". +/// Note: this pass may invalidate pointers to functions. +/// +/// Resource IDs of read-only images, write-only images and samplers are +/// defined to be their index among the kernel arguments of the same +/// type and access qualifier. 
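+///
+/// Worked example (hypothetical kernel, SPIR-style metadata assumed):
+/// \code
+///   kernel void k(read_only image2d_t A, write_only image2d_t B,
+///                 sampler_t S, read_only image2d_t C);
+/// \endcode
+/// Every image argument gains a size argument ([3 x i32], named
+/// "__size_<arg>") and a format argument ([2 x i32], named "__format_<arg>")
+/// immediately after it, with kernel_arg_type/base_type metadata set to
+/// "__llvm_image_size" and "__llvm_image_format". Resource IDs are counted
+/// per class: A = 0 and C = 1 among read-only images, B = 0 among write-only
+/// images, and S = 0 among samplers.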
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; + +StringRef ImageSizeArgMDType = "__llvm_image_size"; +StringRef ImageFormatArgMDType = "__llvm_image_format"; + +StringRef KernelsMDNodeName = "opencl.kernels"; +StringRef KernelArgMDNodeNames[] = { + "kernel_arg_addr_space", + "kernel_arg_access_qual", + "kernel_arg_type", + "kernel_arg_base_type", + "kernel_arg_type_qual"}; +const unsigned NumKernelArgMDNodes = 5; + +typedef SmallVector<Metadata *, 8> MDVector; +struct KernelArgMD { + MDVector ArgVector[NumKernelArgMDNodes]; +}; + +} // end anonymous namespace + +static inline bool +IsImageType(StringRef TypeString) { + return TypeString == "image2d_t" || TypeString == "image3d_t"; +} + +static inline bool +IsSamplerType(StringRef TypeString) { + return TypeString == "sampler_t"; +} + +static Function * +GetFunctionFromMDNode(MDNode *Node) { + if (!Node) + return nullptr; + + size_t NumOps = Node->getNumOperands(); + if (NumOps != NumKernelArgMDNodes + 1) + return nullptr; + + auto F = mdconst::dyn_extract<Function>(Node->getOperand(0)); + if (!F) + return nullptr; + + // Sanity checks. + size_t ExpectNumArgNodeOps = F->arg_size() + 1; + for (size_t i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1)); + if (ArgNode->getNumOperands() != ExpectNumArgNodeOps) + return nullptr; + if (!ArgNode->getOperand(0)) + return nullptr; + + // FIXME: It should be possible to do image lowering when some metadata + // args missing or not in the expected order. 
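+    // For reference, the expected (SPIR-style) metadata shape is roughly:
+    //   !opencl.kernels = !{!0, ...}
+    //   !0 = !{void (...)* @kern, !1, !2, !3, !4, !5}
+    //   !1 = !{!"kernel_arg_addr_space", i32 1, i32 0, ...}
+    //   !2 = !{!"kernel_arg_access_qual", !"read_only", !"none", ...}
+    // i.e. operand 0 of the kernel node is the function and each of the five
+    // argument nodes holds its name string followed by one entry per argument;
+    // the exact form depends on the OpenCL frontend.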
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0)); + if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i]) + return nullptr; + } + + return F; +} + +static StringRef +AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2)); + return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString(); +} + +static StringRef +ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3)); + return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString(); +} + +static MDVector +GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) { + MDVector Res; + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1)); + Res.push_back(Node->getOperand(OpIdx)); + } + return Res; +} + +static void +PushArgMD(KernelArgMD &MD, const MDVector &V) { + assert(V.size() == NumKernelArgMDNodes); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MD.ArgVector[i].push_back(V[i]); + } +} + +namespace { + +class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { + static char ID; + + LLVMContext *Context; + Type *Int32Type; + Type *ImageSizeType; + Type *ImageFormatType; + SmallVector<Instruction *, 4> InstsToErase; + + bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID, + Argument &ImageSizeArg, + Argument &ImageFormatArg) { + bool Modified = false; + + for (auto &Use : ImageArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name.startswith(GetImageResourceIDFunc)) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else if (Name.startswith(GetImageSizeFunc)) { + Replacement = &ImageSizeArg; + } else if (Name.startswith(GetImageFormatFunc)) { + Replacement = &ImageFormatArg; + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { + bool Modified = false; + + for (const auto &Use : SamplerArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name == GetSamplerResourceIDFunc) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { + uint32_t NumReadOnlyImageArgs = 0; + uint32_t NumWriteOnlyImageArgs = 0; + uint32_t NumSamplerArgs = 0; + + bool Modified = false; + InstsToErase.clear(); + for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) { + Argument &Arg = *ArgI; + StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo()); + + // Handle image types. 
+ if (IsImageType(Type)) { + StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo()); + uint32_t ResourceID; + if (AccessQual == "read_only") { + ResourceID = NumReadOnlyImageArgs++; + } else if (AccessQual == "write_only") { + ResourceID = NumWriteOnlyImageArgs++; + } else { + llvm_unreachable("Wrong image access qualifier."); + } + + Argument &SizeArg = *(++ArgI); + Argument &FormatArg = *(++ArgI); + Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg); + + // Handle sampler type. + } else if (IsSamplerType(Type)) { + uint32_t ResourceID = NumSamplerArgs++; + Modified |= replaceSamplerUses(Arg, ResourceID); + } + } + for (unsigned i = 0; i < InstsToErase.size(); ++i) { + InstsToErase[i]->eraseFromParent(); + } + + return Modified; + } + + std::tuple<Function *, MDNode *> + addImplicitArgs(Function *F, MDNode *KernelMDNode) { + bool Modified = false; + + FunctionType *FT = F->getFunctionType(); + SmallVector<Type *, 8> ArgTypes; + + // Metadata operands for new MDNode. + KernelArgMD NewArgMDs; + PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); + + // Add implicit arguments to the signature. + for (unsigned i = 0; i < FT->getNumParams(); ++i) { + ArgTypes.push_back(FT->getParamType(i)); + MDVector ArgMD = GetArgMD(KernelMDNode, i + 1); + PushArgMD(NewArgMDs, ArgMD); + + if (!IsImageType(ArgTypeFromMD(KernelMDNode, i))) + continue; + + // Add size implicit argument. + ArgTypes.push_back(ImageSizeType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + // Add format implicit argument. + ArgTypes.push_back(ImageFormatType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + Modified = true; + } + if (!Modified) { + return std::make_tuple(nullptr, nullptr); + } + + // Create function with new signature and clone the old body into it. + auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false); + auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg: F->args()) { + auto ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) { + (NewFArgIt++)->setName(Twine("__size_") + ArgName); + (NewFArgIt++)->setName(Twine("__format_") + ArgName); + } + } + SmallVector<ReturnInst*, 8> Returns; + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + + // Build new MDNode. + SmallVector<llvm::Metadata *, 6> KernelMDArgs; + KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) + KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); + + return std::make_tuple(NewF, NewMDNode); + } + + bool transformKernels(Module &M) { + NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName); + if (!KernelsMDNode) + return false; + + bool Modified = false; + for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) { + MDNode *KernelMDNode = KernelsMDNode->getOperand(i); + Function *F = GetFunctionFromMDNode(KernelMDNode); + if (!F) + continue; + + Function *NewF; + MDNode *NewMDNode; + std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode); + if (NewF) { + // Replace old function and metadata with new ones. 
+ F->eraseFromParent(); + M.getFunctionList().push_back(NewF); + M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(), + NewF->getAttributes()); + KernelsMDNode->setOperand(i, NewMDNode); + + F = NewF; + KernelMDNode = NewMDNode; + Modified = true; + } + + Modified |= replaceImageAndSamplerUses(F, KernelMDNode); + } + + return Modified; + } + + public: + AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + Context = &M.getContext(); + Int32Type = Type::getInt32Ty(M.getContext()); + ImageSizeType = ArrayType::get(Int32Type, 3); + ImageFormatType = ArrayType::get(Int32Type, 2); + + return transformKernels(M); + } + + const char *getPassName() const override { + return "AMDGPU OpenCL Image Type Pass"; + } +}; + +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + +} // end anonymous namespace + +ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { + return new AMDGPUOpenCLImageTypeLoweringPass(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp new file mode 100644 index 0000000..87d50d5 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,425 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor<AMDGPUPromoteAlloca> { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Promote Alloca"; } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. 
Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = &*I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast<Instruction>(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value * +calculateVectorIndex(Value *Ptr, + const std::map<GetElementPtrInst *, Value *> &GEPIdx) { + if (isa<AllocaInst>(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + + auto I = GEPIdx.find(GEP); + return I == GEPIdx.end() ? nullptr : I->second; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME we only support simple cases + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +// Not an instruction handled below to turn into a vector. +// +// TODO: Check isTriviallyVectorizable for calls and handle other +// instructions. +static bool canVectorizeInst(Instruction *Inst, User *User) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + return true; + case Instruction::Store: { + // Must be the stored pointer operand, not a stored value. + StoreInst *SI = cast<StoreInst>(Inst); + return SI->getPointerOperand() == User; + } + default: + return false; + } +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now. + if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map<GetElementPtrInst*, Value*> GEPVectorIdx; + std::vector<Value*> WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); + if (!GEP) { + if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca)) + return false; + + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. 
+ if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser)) + return false; + + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector " + << *AllocaTy << " -> " << *VectorTy << '\n'); + + for (std::vector<Value*>::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast<Instruction>(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + } + return true; +} + +static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { + bool Success = true; + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (CallInst *CI = dyn_cast<CallInst>(User)) { + // TODO: We might be able to handle some cases where the callee is a + // constantexpr bitcast of a function. + if (!CI->getCalledFunction()) + return false; + + WorkList.push_back(User); + continue; + } + + // FIXME: Correctly handle ptrtoint instructions. + Instruction *UseInst = dyn_cast<Instruction>(User); + if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + return false; + + if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) { + // Reject if the stored value is not the pointer operand. + if (SI->getPointerOperand() != Val) + return false; + } + + if (!User->getType()->isPointerTy()) + continue; + + WorkList.push_back(User); + + Success &= collectUsesWithPtrTypes(User, WorkList); + } + return Success; +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + if (!I.isStaticAlloca()) + return; + + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. 
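+  // Worked example with this bound: a [4 x float] alloca takes 16 bytes per
+  // work item and is therefore charged 256 * 16 = 4096 bytes of LDS, so a
+  // 32 KiB LocalMemAvailable budget admits at most eight such allocas.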
+ unsigned WorkGroupSize = 256; + int AllocaSize = + WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + std::vector<Value*> WorkList; + + if (!collectUsesWithPtrTypes(&I, WorkList)) { + DEBUG(dbgs() << " Do not know how to convert all uses\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); + GlobalVariable *GV = new GlobalVariable( + *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); + Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); + Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector<Value*> Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + for (std::vector<Value*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast<CallInst>(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + // The operand's value should be corrected on its own. + if (isa<AddrSpaceCastInst>(V)) + continue; + + // FIXME: It doesn't really make sense to try to do this for all + // instructions. 
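+      // (Each work item addresses its own element of the [256 x <type>] LDS
+      // array through the index computed above:
+      //   TID = tid.x * ntid.y * ntid.z + tid.y * ntid.z + tid.z,
+      // e.g. a local size of (4,2,2) and id (1,1,1) give TID = 4 + 2 + 1 = 7.
+      // Non-call users such as this one then only need their pointer type
+      // switched to the LOCAL_ADDRESS space.)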
+ V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); + if (!Intr) { + std::vector<Type*> ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), + NewType, F->getAttributes()); + Function *NewF = cast<Function>(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast<MemSetInst>(Intr); + Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp new file mode 100644 index 0000000..3ca0eca --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -0,0 +1,63 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom used feature of GPUS, so +// they are not supported at this time. 
+//===----------------------------------------------------------------------===// + +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { + static const unsigned SubRegs[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, + AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, + AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, + AMDGPU::sub15 + }; + + assert(Channel < array_lengthof(SubRegs)); + return SubRegs[Channel]; +} + +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + return getSubRegFromChannel(IndirectIndex); +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h new file mode 100644 index 0000000..0344834 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -0,0 +1,60 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUSubtarget; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + static const MCPhysReg CalleeSavedReg; + + AMDGPURegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override { + assert(!"Unimplemented"); return BitVector(); + } + + virtual unsigned getHWRegIndex(unsigned Reg) const { + assert(!"Unimplemented"); return 0; + } + + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) + unsigned getSubRegFromChannel(unsigned Channel) const; + + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td new file mode 100644 index 0000000..ba0490a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -0,0 +1,25 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + +foreach Index = 0-15 in { + def sub#Index : SubRegIndex<32, !shl(Index, 5)>; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp new file mode 100644 index 0000000..c6af5b9 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -0,0 +1,154 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIFrameLowering.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-subtarget" + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget & +AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS) { + // Determine default and user-specified characteristics + // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be + // enabled, but some instructions do not respect them and they run at the + // double precision rate, so don't enable by default. + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. + + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 
+ FullFS += "+flat-for-global,"; + FullFS += FS; + + if (GPU == "" && TT.getArch() == Triple::amdgcn) + GPU = "SI"; + + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + FP64Denormals = false; + } + return *this; +} + +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), + DumpCode(false), R600ALUInst(false), HasVertexCache(false), + TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), + FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), + EnableXNACK(false), + WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), + GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), + IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), + FrameLowering(nullptr), + InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { + + initializeSubtargetDependencies(TT, GPU, FS); + + const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) + + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + TLInfo.reset(new R600TargetLowering(TM, *this)); + + // FIXME: Should have R600 specific FrameLowering + FrameLowering.reset(new AMDGPUFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + TLInfo.reset(new SITargetLowering(TM, *this)); + FrameLowering.reset(new SIFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); + } +} + +unsigned AMDGPUSubtarget::getStackEntrySize() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + switch(getWavefrontSize()) { + case 16: + return 8; + case 32: + return hasCaymanISA() ? 4 : 8; + case 64: + return 4; + default: + llvm_unreachable("Illegal wavefront size."); + } +} + +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { + switch(getGeneration()) { + default: llvm_unreachable("ChipID unknown"); + case SEA_ISLANDS: return 12; + } +} + +AMDGPU::IsaVersion AMDGPUSubtarget::getIsaVersion() const { + return AMDGPU::getIsaVersion(getFeatureBits()); +} + +bool AMDGPUSubtarget::isVGPRSpillingEnabled( + const SIMachineFunctionInfo *MFI) const { + return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +} + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. 
+ Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } +} + diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h new file mode 100644 index 0000000..d371227 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -0,0 +1,323 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H + +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" +#include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +namespace llvm { + +class SIMachineFunctionInfo; + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + +public: + enum Generation { + R600 = 0, + R700, + EVERGREEN, + NORTHERN_ISLANDS, + SOUTHERN_ISLANDS, + SEA_ISLANDS, + VOLCANIC_ISLANDS, + }; + + enum { + FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + }; + + enum { + ISAVersion0_0_0, + ISAVersion7_0_0, + ISAVersion7_0_1, + ISAVersion8_0_0, + ISAVersion8_0_1 + }; + +private: + std::string DevName; + bool Is64bit; + bool DumpCode; + bool R600ALUInst; + bool HasVertexCache; + short TexVTXClauseSize; + Generation Gen; + bool FP64; + bool FP64Denormals; + bool FP32Denormals; + bool FastFMAF32; + bool CaymanISA; + bool FlatAddressSpace; + bool FlatForGlobal; + bool EnableIRStructurizer; + bool EnablePromoteAlloca; + bool EnableIfCvt; + bool EnableLoadStoreOpt; + bool EnableUnsafeDSOffsetFolding; + bool EnableXNACK; + unsigned WavefrontSize; + bool CFALUBug; + int LocalMemorySize; + bool EnableVGPRSpilling; + bool SGPRInitBug; + bool IsGCN; + bool GCN1Encoding; + bool GCN3Encoding; + bool CIInsts; + bool FeatureDisable; + int LDSBankCount; + unsigned IsaVersion; + bool EnableHugeScratchBuffer; + + std::unique_ptr<AMDGPUFrameLowering> FrameLowering; + std::unique_ptr<AMDGPUTargetLowering> TLInfo; + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; + InstrItineraryData InstrItins; + Triple TargetTriple; + +public: + AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + TargetMachine &TM); + AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + const AMDGPUFrameLowering *getFrameLowering() const override { + return FrameLowering.get(); + } + const AMDGPUInstrInfo *getInstrInfo() const override { + return InstrInfo.get(); + } + const AMDGPURegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + AMDGPUTargetLowering *getTargetLowering() const override { + return TLInfo.get(); + } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation 
getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFP64Denormals() const { + return FP64Denormals; + } + + bool hasFastFMAF32() const { + return FastFMAF32; + } + + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + + bool useFlatForGlobal() const { + return FlatForGlobal; + } + + bool hasBFE() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFM() const { + return hasBFE(); + } + + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + if (Size == 64) + return (getGeneration() >= SOUTHERN_ISLANDS); + + return false; + } + + bool hasMulU24() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + + bool unsafeDSOffsetFoldingEnabled() const { + return EnableUnsafeDSOffsetFolding; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + + unsigned getStackEntrySize() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } + + bool hasSGPRInitBug() const { + return SGPRInitBug; + } + + int getLDSBankCount() const { + return LDSBankCount; + } + + unsigned getAmdKernelCodeChipID() const; + + AMDGPU::IsaVersion getIsaVersion() const; + + bool enableMachineScheduler() const override { + return true; + } + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const override; + + // Helper functions to simplify if statements + bool isTargetELF() const { + return false; + } + + StringRef getDeviceName() const { + return DevName; + } + + bool enableHugeScratchBuffer() const { + return EnableHugeScratchBuffer; + } + + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + + bool isXNACKEnabled() const { + return EnableXNACK; + } + + unsigned getMaxWavesPerCU() const { + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 10; + + // FIXME: Not sure what this is for other subtagets. + llvm_unreachable("do not know max waves per CU for this subtarget."); + } + + bool enableSubRegLiveness() const override { + return true; + } + + /// \brief Returns the offset in bytes from the start of the input buffer + /// of the first explicit kernel argument. + unsigned getExplicitKernelArgOffset() const { + return isAmdHsaOS() ? 
0 : 36; + } + + unsigned getMaxNumUserSGPRs() const { + return 16; + } +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp new file mode 100644 index 0000000..b1be619 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -0,0 +1,350 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "AMDGPUTargetTransformInfo.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include <llvm/CodeGen/Passes.h> + +using namespace llvm; + +extern "C" void LLVMInitializeAMDGPUTarget() { + // Register the target + RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); + RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + + PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeSILowerI1CopiesPass(*PR); + initializeSIFixSGPRCopiesPass(*PR); + initializeSIFoldOperandsPass(*PR); + initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.getOS() == Triple::AMDHSA) + return make_unique<AMDGPUHSATargetObjectFile>(); + + return make_unique<AMDGPUTargetObjectFile>(); +} + +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); +} + +static MachineSchedRegistry +R600SchedRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static MachineSchedRegistry +SISchedRegistry("si", "Run SI's custom scheduler", + createSIMachineScheduler); + +static std::string computeDataLayout(const Triple &TT) { + std::string Ret = "e-p:32:32"; + + if (TT.getArch() == Triple::amdgcn) { + // 32-bit private, local, and region pointers. 64-bit global and constant. 
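+    // Reading the tokens: "p1:64:64" gives address-space-1 pointers a 64-bit
+    // size and ABI alignment, the leading "p:32:32" above covers address
+    // space 0, "i64:64" aligns i64 to 64 bits, and "n32:64" lists the native
+    // integer widths (see the LangRef data layout description).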
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OptLevel) + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OptLevel), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), + IntrinsicInfo() { + setRequiresStructuredCFG(true); + initAsmInfo(); +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { } + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// AMDGPU Pass Setup +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) { + + // Exceptions and StackMaps are not supported, so these passes will never do + // anything. 
+ disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + } + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM<AMDGPUTargetMachine>(); + } + + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + return createR600MachineScheduler(C); + return nullptr; + } + + void addIRPasses() override; + void addCodeGenPrepare() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addGCPasses() override; +}; + +class R600PassConfig : public AMDGPUPassConfig { +public: + R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + + bool addPreISel() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +class GCNPassConfig : public AMDGPUPassConfig { +public: + GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + bool addPreISel() override; + bool addInstSelector() override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +} // End of anonymous namespace + +TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo( + AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); + }); +} + +void AMDGPUPassConfig::addIRPasses() { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. + addPass(createBarrierNoopPass()); + + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. + addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + + TargetPassConfig::addIRPasses(); +} + +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.isPromoteAllocaEnabled()) { + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + } + TargetPassConfig::addCodeGenPrepare(); +} + +bool +AMDGPUPassConfig::addPreISel() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createFlattenCFGPass()); + if (ST.IsIRStructurizerEnabled()) + addPass(createStructurizeCFGPass()); + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} + +bool AMDGPUPassConfig::addGCPasses() { + // Do nothing. GC is not supported. 
+ return false; +} + +//===----------------------------------------------------------------------===// +// R600 Pass Setup +//===----------------------------------------------------------------------===// + +bool R600PassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createR600TextureIntrinsicsReplacer()); + return false; +} + +void R600PassConfig::addPreRegAlloc() { + addPass(createR600VectorRegMerger(*TM)); +} + +void R600PassConfig::addPreSched2() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createR600EmitClauseMarkers(), false); + if (ST.isIfCvtEnabled()) + addPass(&IfConverterID, false); + addPass(createR600ClauseMergePass(*TM), false); +} + +void R600PassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + addPass(createR600ControlFlowFinalizer(*TM), false); +} + +TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { + return new R600PassConfig(this, PM); +} + +//===----------------------------------------------------------------------===// +// GCN Pass Setup +//===----------------------------------------------------------------------===// + +bool GCNPassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + + // FIXME: We need to run a pass to propagate the attributes when calls are + // supported. + addPass(&AMDGPUAnnotateKernelFeaturesID); + + addPass(createSinkingPass()); + addPass(createSITypeRewriter()); + addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + + return false; +} + +bool GCNPassConfig::addInstSelector() { + AMDGPUPassConfig::addInstSelector(); + addPass(createSILowerI1CopiesPass()); + addPass(&SIFixSGPRCopiesID); + addPass(createSIFoldOperandsPass()); + return false; +} + +void GCNPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + + // This needs to be run directly before register allocation because + // earlier passes might recompute live intervals. + // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass + if (getOptLevel() > CodeGenOpt::None) { + insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + } + + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + insertPass(&MachineSchedulerID, &RegisterCoalescerID); + } + addPass(createSIShrinkInstructionsPass(), false); +} + +void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + addPass(&SIFixSGPRLiveRangesID); + TargetPassConfig::addFastRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // We want to run this after LiveVariables is computed to avoid computing them + // twice. + // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure + // that needs to be fixed. 
+ insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addPostRegAlloc() { + addPass(createSIShrinkInstructionsPass(), false); +} + +void GCNPassConfig::addPreSched2() { +} + +void GCNPassConfig::addPreEmitPass() { + addPass(createSIInsertWaits(*TM), false); + addPass(createSILowerControlFlowPass(*TM), false); +} + +TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { + return new GCNPassConfig(this, PM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h new file mode 100644 index 0000000..236e3f8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -0,0 +1,89 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H + +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/IR/DataLayout.h" + +namespace llvm { + +//===----------------------------------------------------------------------===// +// AMDGPU Target Machine (R600+) +//===----------------------------------------------------------------------===// + +class AMDGPUTargetMachine : public LLVMTargetMachine { +private: + +protected: + std::unique_ptr<TargetLoweringObjectFile> TLOF; + AMDGPUSubtarget Subtarget; + AMDGPUIntrinsicInfo IntrinsicInfo; + +public: + AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + + const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } + TargetIRAnalysis getTargetIRAnalysis() override; + + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } +}; + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +class R600TargetMachine : public AMDGPUTargetMachine { + +public: + R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +class GCNTargetMachine : public AMDGPUTargetMachine { + +public: + GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, 
Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp new file mode 100644 index 0000000..e050f21 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + +void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM){ + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); + + TextSection = AMDGPU::getHSATextSection(Ctx); + + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast<MCSectionELF>(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. 
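To make the agent/program distinction concrete, the sketch below builds a global in the global segment (address space 1 in this backend's numbering) and tags it with an explicit agent data section; under the predicates above it would then be classified as an agent allocation, while the same global with no section falls back to program allocation. The helper name and the ".hsadata_global_agent" section string are assumptions based on the section accessors used in Initialize(), not verified against AMDGPUBaseInfo:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Type.h"

    static llvm::GlobalVariable *makeAgentGlobal(llvm::Module &M) {
      using namespace llvm;
      Type *I32 = Type::getInt32Ty(M.getContext());
      // Address space 1 is the global segment for this target.
      GlobalVariable *GV = new GlobalVariable(
          M, I32, /*isConstant=*/false, GlobalValue::ExternalLinkage,
          ConstantInt::get(I32, 0), "agent_flag",
          /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
          /*AddressSpace=*/1);
      // With an explicit agent data section, isAgentAllocation() is true;
      // without any section the global is a program allocation by default.
      GV->setSection(".hsadata_global_agent"); // assumed HSA agent section name
      return GV;
    }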
+ return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); +} + +MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isText() && !GV->hasComdat()) + return getTextSection(); + + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h new file mode 100644 index 0000000..921341e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -0,0 +1,51 @@ +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the AMDGPU-specific subclass of +/// TargetLoweringObjectFile. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp new file mode 100644 index 0000000..54a003d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -0,0 +1,182 @@ +//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "AMDGPUtti" + +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Threshold = 300; // Twice the default. + UP.MaxCount = UINT_MAX; + UP.Partial = true; + + // TODO: Do we want runtime unrolling? + + for (const BasicBlock *BB : L->getBlocks()) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + for (const Instruction &I : *BB) { + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); + if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + continue; + + const Value *Ptr = GEP->getPointerOperand(); + const AllocaInst *Alloca = + dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); + if (Alloca) { + // We want to do whatever we can to limit the number of alloca + // instructions that make it through to the code generator. allocas + // require us to use indirect addressing, which is slow and prone to + // compiler bugs. If this loop does an address calculation on an + // alloca ptr, then we want to use a higher than normal loop unroll + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. + UP.Threshold = 800; + } + } + } +} + +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { + if (Vec) + return 0; + + // Number of VGPRs on SI. + if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 256; + + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { + return Vector ? 0 : 32; +} + +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Semi-arbitrary large amount. + return 64; +} + +unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 
2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, + const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::not_intrinsic: + // This means we have an intrinsic that isn't defined in + // IntrinsicsAMDGPU.td + break; + + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + return true; + } + + StringRef Name = I->getCalledFunction()->getName(); + switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { + default: + return false; + case AMDGPUIntrinsic::SI_tid: + case AMDGPUIntrinsic::SI_fs_interp: + return true; + } +} + +static bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + unsigned ShaderType = AMDGPU::getShaderType(*F); + + // Arguments to compute shaders are never a source of divergence. + if (ShaderType == ShaderType::COMPUTE) + return true; + + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || + F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + return true; + + // Everything else is in VGPRs. + return false; +} + +/// +/// \returns true if the result of the value could potentially be +/// different across workitems in a wavefront. +bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { + + if (const Argument *A = dyn_cast<Argument>(V)) + return !isArgPassedInSGPR(A); + + // Loads from the private address space are divergent, because threads + // can execute the load instruction with the same inputs and get different + // results. + // + // All other loads are not divergent, because if threads issue loads with the + // same arguments, they will always get the same result. + if (const LoadInst *Load = dyn_cast<LoadInst>(V)) + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + const TargetMachine &TM = getTLI()->getTargetMachine(); + return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); + } + + // Assume all function calls are a source of divergence. + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h new file mode 100644 index 0000000..976afb0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -0,0 +1,72 @@ +//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// AMDGPU target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. 
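As a usage note for the divergence hooks implemented above: passes do not construct AMDGPUTTIImpl themselves, they query the TargetTransformInfo wrapper built by getTargetIRAnalysis() in AMDGPUTargetMachine.cpp. A minimal sketch, assuming this version's TargetTransformInfo API (the helper name is illustrative), counting the values the target reports as divergent, such as private-address-space loads, workitem-id intrinsics, and arguments not passed in SGPRs:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instruction.h"

    static unsigned countDivergentValues(llvm::Function &F,
                                         const llvm::TargetTransformInfo &TTI) {
      unsigned NumDivergent = 0;
      // Arguments are divergent unless the target says they live in SGPRs.
      for (llvm::Argument &A : F.args())
        if (TTI.isSourceOfDivergence(&A))
          ++NumDivergent;
      // Instructions such as private loads and tid intrinsics are reported
      // as divergent by the implementation above.
      for (llvm::BasicBlock &BB : F)
        for (llvm::Instruction &I : BB)
          if (TTI.isSourceOfDivergence(&I))
            ++NumDivergent;
      return NumDivergent;
    }

    // In a legacy FunctionPass the TTI reference would typically come from:
    //   getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)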
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { + typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + const AMDGPUSubtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL) + : BaseT(TM, DL), ST(TM->getSubtargetImpl()), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); + + unsigned getCFInstrCost(unsigned Opcode); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + bool isSourceOfDivergence(const Value *V) const; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp new file mode 100644 index 0000000..917efd1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -0,0 +1,1897 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include <deque> + +using namespace llvm; + +#define DEBUG_TYPE "structcfg" + +#define DEFAULT_VEC_SLOTS 8 + +// TODO: move-begin. 
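Before the statistics and helpers that follow, a brief sketch of the reductions this structurizer performs; the diagrams below are descriptive comments only, with illustrative block names, and are not part of the imported source:

    // The structurizer repeatedly collapses simple CFG shapes over the ordered
    // block list until, ideally, a single block remains:
    //
    //   Serial pattern (serialPatternMatch):
    //     B0 -> B1 where B1 has a single predecessor  =>  B0 and B1 merged.
    //
    //   If pattern (ifPatternMatch):
    //        B0                          B0'
    //       /  \                          |
    //     B1    B2          =>          Land
    //       \  /
    //        Land
    //     with IF/ELSE/ENDIF pseudo instructions emitted into the merged block
    //     (see mergeIfthenelseBlock), cloning B1/B2 first when they have side
    //     entries.
    //
    //   Loop patterns (loopendPatternMatch/mergeLoop): exiting edges become
    //   breaks, latch edges become continues, and the remaining body is folded
    //   into the header (see mergeLoopbreakBlock, settleLoopcontBlock, and
    //   mergeLooplandBlock); wrapup() later deletes CONTINUEs that directly
    //   precede an ENDLOOP.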
+ +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +namespace llvm { + void initializeAMDGPUCFGStructurizerPass(PassRegistry&); +} + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. +// +//===----------------------------------------------------------------------===// +namespace { +#define SHOWNEWINSTR(i) \ + DEBUG(dbgs() << "New instr: " << *i << "\n"); + +#define SHOWNEWBLK(b, msg) \ +DEBUG( \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + dbgs() << "\n"; \ +); + +#define SHOWBLK_DETAIL(b, msg) \ +DEBUG( \ + if (b) { \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(dbgs()); \ + dbgs() << "\n"; \ + } \ +); + +#define INVALIDSCCNUM -1 + +template<class NodeT> +void ReverseVector(SmallVectorImpl<NodeT *> &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + + +namespace { + +class BlockInformation { +public: + bool IsRetired; + int SccNum; + BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUCFGStructurizer : public MachineFunctionPass { +public: + typedef SmallVector<MachineBasicBlock *, 32> MBBVector; + typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap; + typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap; + + enum PathToKind { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + }; + + static char ID; + + AMDGPUCFGStructurizer() : + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { + initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AMDGPU Control Flow Graph structurizer Pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + } + + /// Perform the CFG structurization + bool run(); + + /// Perform the CFG preparation + /// This step will remove every unconditionnal/dead jump instructions and make + /// sure all loops have an exit block + bool prepare(); + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = 
&TII->getRegisterInfo(); + DEBUG(MF.dump();); + OrderedBlks.clear(); + Visited.clear(); + FuncRep = &MF; + MLI = &getAnalysis<MachineLoopInfo>(); + DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); + MDT = &getAnalysis<MachineDominatorTree>(); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); + PDT = &getAnalysis<MachinePostDominatorTree>(); + DEBUG(PDT->print(dbgs());); + prepare(); + run(); + DEBUG(MF.dump();); + return true; + } + +protected: + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; + MachineLoopInfo *MLI; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + + // PRINT FUNCTIONS + /// Print the ordered Blocks. + void printOrderedBlocks() const { + size_t i = 0; + for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), + iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { + dbgs() << "BB" << (*iterBlk)->getNumber(); + dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + dbgs() << "\n"; + } else { + dbgs() << " "; + } + } + } + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { + for (MachineLoop::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { + (*iter)->print(dbgs(), 0); + } + } + + // UTILITY FUNCTIONS + int getSCCNum(MachineBasicBlock *MBB) const; + MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; + bool hasBackEdge(MachineBasicBlock *MBB) const; + static unsigned getLoopDepth(MachineLoop *LoopRep); + bool isRetiredBlock(MachineBasicBlock *MBB) const; + bool isActiveLoophead(MachineBasicBlock *MBB) const; + PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry = true) const; + int countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const; + bool needMigrateBlock(MachineBasicBlock *MBB) const; + + // Utility Functions + void reversePredicateSetter(MachineBasicBlock::iterator I); + /// Compute the reversed DFS post order of Blocks + void orderBlocks(MachineFunction *MF); + + // Function originally from CFGStructTraits + void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); + void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, + DebugLoc DL); + void insertCondBranchBefore(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL); + void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); + static int getBranchNzeroOpcode(int OldOpcode); + static int getBranchZeroOpcode(int OldOpcode); + static int getContinueNzeroOpcode(int OldOpcode); + static int getContinueZeroOpcode(int OldOpcode); + static MachineBasicBlock *getTrueBranch(MachineInstr *MI); + static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); + static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI); + static bool isCondBranch(MachineInstr *MI); + static bool isUncondBranch(MachineInstr *MI); + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); + /// The correct naming for this is getPossibleLoopendBlockBranchInstr. + /// + /// BB with backward-edge could have move instructions after the branch + /// instruction. 
Such move instruction "belong to" the loop backward-edge. + MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); + static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); + static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); + static bool isReturnBlock(MachineBasicBlock *MBB); + static void cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) ; + static MachineBasicBlock *clone(MachineBasicBlock *MBB); + /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose + /// because the AMDGPU instruction is not recognized as terminator fix this + /// and retire this routine + void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, + MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); + static void wrapup(MachineBasicBlock *MBB); + + + int patternMatch(MachineBasicBlock *MBB); + int patternMatchGroup(MachineBasicBlock *MBB); + int serialPatternMatch(MachineBasicBlock *MBB); + int ifPatternMatch(MachineBasicBlock *MBB); + int loopendPatternMatch(); + int mergeLoop(MachineLoop *LoopRep); + int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); + + void handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop); + /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in + /// the same loop with LoopLandInfo without explicitly keeping track of + /// loopContBlks and loopBreakBlks, this is a method to get the information. + bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, + MachineBasicBlock *Src2MBB); + int handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr); + void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock *LandMBB, bool Detail = false); + int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); + void mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB); + + void mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); + void mergeLooplandBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *LandMBB); + void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB); + void settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB); + /// normalizeInfiniteLoopExit change + /// B1: + /// uncond_br LoopHeader + /// + /// to + /// B1: + /// cond_br 1 LoopHeader dummyExit + /// and return the newly added dummy exit block + MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); + void removeUnconditionalBranch(MachineBasicBlock *MBB); + /// Remove duplicate branches instructions in a block. 
+ /// For instance + /// B0: + /// cond_br X B1 B2 + /// cond_br X B1 B2 + /// is transformed to + /// B0: + /// cond_br X B1 B2 + void removeRedundantConditionalBranch(MachineBasicBlock *MBB); + void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB); + void removeSuccessor(MachineBasicBlock *MBB); + MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB); + void migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); + void recordSccnum(MachineBasicBlock *MBB, int SCCNum); + void retireBlock(MachineBasicBlock *MBB); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); + + MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); + /// This is work around solution for findNearestCommonDominator not available + /// to post dom a proper fix should go to Dominators.h. + MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2); + +private: + MBBInfoMap BlockInfoMap; + LoopLandInfoMap LLInfoMap; + std::map<MachineLoop *, bool> Visited; + MachineFunction *FuncRep; + SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks; +}; + +int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return INVALIDSCCNUM; + return (*It).second->SccNum; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) + const { + LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); + if (It == LLInfoMap.end()) + return nullptr; + return (*It).second; +} + +bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + if (!LoopRep) + return false; + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + return MBB->isSuccessor(LoopHeader); +} + +unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { + return LoopRep ? 
LoopRep->getLoopDepth() : 0; +} + +bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return false; + return (*It).second->IsRetired; +} + +bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + while (LoopRep && LoopRep->getHeader() == MBB) { + MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); + if(!LoopLand) + return true; + if (!isRetiredBlock(LoopLand)) + return true; + LoopRep = LoopRep->getParentLoop(); + } + return false; +} +AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry) const { + assert(DstMBB); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + while (SrcMBB && SrcMBB->succ_size() == 1) { + SrcMBB = *SrcMBB->succ_begin(); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + if (!AllowSideEntry && SrcMBB->pred_size() > 1) + return Not_SinglePath; + } + if (SrcMBB && SrcMBB->succ_size()==0) + return SinglePath_NotInPath; + return Not_SinglePath; +} + +int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const { + int Count = 0; + while (It != E) { + if (!isRetiredBlock(*It)) + ++Count; + ++It; + } + return Count; +} + +bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { + unsigned BlockSizeThreshold = 30; + unsigned CloneInstrThreshold = 100; + bool MultiplePreds = MBB && (MBB->pred_size() > 1); + + if(!MultiplePreds) + return false; + unsigned BlkSize = MBB->size(); + return ((BlkSize > BlockSizeThreshold) && + (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); +} + +void AMDGPUCFGStructurizer::reversePredicateSetter( + MachineBasicBlock::iterator I) { + while (I--) { + if (I->getOpcode() == AMDGPU::PRED_X) { + switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { + case OPCODE_IS_ZERO_INT: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO_INT); + return; + case OPCODE_IS_NOT_ZERO_INT: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO_INT); + return; + case OPCODE_IS_ZERO: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO); + return; + case OPCODE_IS_NOT_ZERO: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO); + return; + default: + llvm_unreachable("PRED_X Opcode invalid!"); + } + } + } +} + +void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = MBB->getParent() + ->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->push_back(MI); + //assume the instruction doesn't take any reg operand ... + SHOWNEWINSTR(MI); +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); + if (MBB->begin() != MBB->end()) + MBB->insert(MBB->begin(), MI); + else + MBB->push_back(MI); + SHOWNEWINSTR(MI); + return MI; +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( + MachineBasicBlock::iterator I, int NewOpcode) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineInstr *NewMBB = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->insert(I, NewMBB); + //assume the instruction doesn't take any reg operand ... 
+ SHOWNEWINSTR(NewMBB); + return NewMBB; +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->insert(I, NewMI); + MachineInstrBuilder MIB(*MF, NewMI); + MIB.addReg(OldMI->getOperand(1).getReg(), false); + SHOWNEWINSTR(NewMI); + //erase later oldInstr->eraseFromParent(); +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL) { + MachineFunction *MF = blk->getParent(); + MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + //insert before + blk->insert(I, NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, + int NewOpcode, int RegNum) { + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewInstr = + MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->push_back(NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: llvm_unreachable("internal error"); + }; + return -1; +} + +int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { + return MI->getOperand(0).getMBB(); +} + +void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, + MachineBasicBlock *MBB) { + MI->getOperand(0).setMBB(MBB); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI) { + assert(MBB->succ_size() == 2); + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + MachineBasicBlock::succ_iterator It = MBB->succ_begin(); + MachineBasicBlock::succ_iterator Next = It; + ++Next; + return (*It == TrueBranch) ? 
*Next : *It; +} + +bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP_COND: + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return true; + default: + return false; + } + return false; +} + +bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP: + case AMDGPU::BRANCH: + return true; + default: + return false; + } + return false; +} + +DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); + ++It) { + MachineInstr *instr = &(*It); + if (instr->getDebugLoc()) + DL = instr->getDebugLoc(); + } + return DL; +} + +MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( + MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + MachineInstr *MI = &*It; + if (MI && (isCondBranch(MI) || isUncondBranch(MI))) + return MI; + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( + MachineBasicBlock *MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); + It != E; ++It) { + // FIXME: Simplify + MachineInstr *MI = &*It; + if (MI) { + if (isCondBranch(MI) || isUncondBranch(MI)) + return MI; + else if (!TII->isMov(MI->getOpcode())) + break; + } + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *instr = &(*It); + if (instr->getOpcode() == AMDGPU::RETURN) + return instr; + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *MI = &(*It); + if (MI->getOpcode() == AMDGPU::CONTINUE) + return MI; + } + return nullptr; +} + +bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { + MachineInstr *MI = getReturnInstr(MBB); + bool IsReturn = (MBB->succ_size() == 0); + if (MI) + assert(IsReturn); + else if (IsReturn) + DEBUG( + dbgs() << "BB" << MBB->getNumber() + <<" is return block without RETURN instr\n";); + return IsReturn; +} + +void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), + iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) + DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of +} + +MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { + MachineFunction *Func = MBB->getParent(); + MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); + Func->push_back(NewMBB); //insert to function + for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); + It != E; ++It) { + MachineInstr *MI = Func->CloneMachineInstr(It); + NewMBB->push_back(MI); + } + return NewMBB; +} + +void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( + MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, + MachineBasicBlock *NewBlk) { + MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); + if (BranchMI && isCondBranch(BranchMI) && + getTrueBranch(BranchMI) == OldMBB) + setTrueBranch(BranchMI, NewBlk); +} + +void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { + assert((!MBB->getParent()->getJumpTableInfo() + || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr; + MachineBasicBlock::iterator Pre = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator It = Pre; + while (It != E) { + if (Pre->getOpcode() == AMDGPU::CONTINUE + && It->getOpcode() == AMDGPU::ENDLOOP) + ContInstr.push_back(Pre); + Pre = It; + ++It; + } + + //delete continue right before endloop + for (unsigned i = 0; i < ContInstr.size(); ++i) + ContInstr[i]->eraseFromParent(); + + // TODO to fix up jump table so later phase won't be confused. if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + +} + + +bool AMDGPUCFGStructurizer::prepare() { + bool Changed = false; + + //FIXME: if not reducible flow graph, make it so ??? + + DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + + orderBlocks(FuncRep); + + SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; + + // Add an ExitBlk to loop that don't have one + for (MachineLoopInfo::iterator It = MLI->begin(), + E = MLI->end(); It != E; ++It) { + MachineLoop *LoopRep = (*It); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + + if (ExitingMBBs.size() == 0) { + MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); + if (DummyExitBlk) + RetBlks.push_back(DummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + for (SmallVectorImpl<MachineBasicBlock *>::const_iterator + It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + removeUnconditionalBranch(MBB); + removeRedundantConditionalBranch(MBB); + if (isReturnBlock(MBB)) { + RetBlks.push_back(MBB); + } + assert(MBB->succ_size() <= 2); + } + + if (RetBlks.size() >= 2) { + addDummyExitBlock(RetBlks); + Changed = true; + } + + return Changed; +} + +bool AMDGPUCFGStructurizer::run() { + + //Assume reducible CFG... + DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); + int NumIter = 0; + bool Finish = false; + MachineBasicBlock *MBB; + bool MakeProgress = false; + int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), + OrderedBlks.end()); + + do { + ++NumIter; + DEBUG( + dbgs() << "numIter = " << NumIter + << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; + ); + + SmallVectorImpl<MachineBasicBlock *>::const_iterator It = + OrderedBlks.begin(); + SmallVectorImpl<MachineBasicBlock *>::const_iterator E = + OrderedBlks.end(); + + SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter = + It; + MachineBasicBlock *SccBeginMBB = nullptr; + int SccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int SccNumIter; // Number of iteration in this SCC. + + while (It != E) { + MBB = *It; + + if (!SccBeginMBB) { + SccBeginIter = It; + SccBeginMBB = MBB; + SccNumIter = 0; + SccNumBlk = NumRemainedBlk; // Init to maximum possible number. 
+ DEBUG( + dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); + dbgs() << "\n"; + ); + } + + if (!isRetiredBlock(MBB)) + patternMatch(MBB); + + ++It; + + bool ContNextScc = true; + if (It == E + || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { + // Just finish one scc. + ++SccNumIter; + int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { + DEBUG( + dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n"; + ); + ContNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { + SccNumBlk = sccRemainedNumBlk; + It = SccBeginIter; + ContNextScc = false; + DEBUG( + dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n'; + ); + } else { + // Finish the current scc. + ContNextScc = true; + } + } else { + // Continue on next component in the current scc. + ContNextScc = false; + } + + if (ContNextScc) + SccBeginMBB = nullptr; + } //while, "one iteration" over the function. + + MachineBasicBlock *EntryMBB = + &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + if (EntryMBB->succ_size() == 0) { + Finish = true; + DEBUG( + dbgs() << "Reduce to one block\n"; + ); + } else { + int NewnumRemainedBlk + = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); + // consider cloned blocks ?? + if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { + MakeProgress = true; + NumRemainedBlk = NewnumRemainedBlk; + } else { + MakeProgress = false; + DEBUG( + dbgs() << "No progress\n"; + ); + } + } + } while (!Finish && MakeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + + // Detach retired Block, release memory. + for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); + It != E; ++It) { + if ((*It).second && (*It).second->IsRetired) { + assert(((*It).first)->getNumber() != -1); + DEBUG( + dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; + ); + (*It).first->eraseFromParent(); //Remove from the parent Function. 
+ } + delete (*It).second; + } + BlockInfoMap.clear(); + LLInfoMap.clear(); + + if (!Finish) { + DEBUG(FuncRep->viewCFG()); + llvm_unreachable("IRREDUCIBLE_CFG"); + } + + return true; +} + + + +void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { + int SccNum = 0; + MachineBasicBlock *MBB; + for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); + ++It, ++SccNum) { + const std::vector<MachineBasicBlock *> &SccNext = *It; + for (std::vector<MachineBasicBlock *>::const_iterator + blockIter = SccNext.begin(), blockEnd = SccNext.end(); + blockIter != blockEnd; ++blockIter) { + MBB = *blockIter; + OrderedBlks.push_back(MBB); + recordSccnum(MBB, SccNum); + } + } + + //walk through all the block in func to check for unreachable + typedef GraphTraits<MachineFunction *> GTM; + MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); + for (; It != E; ++It) { + MachineBasicBlock *MBB = &(*It); + SccNum = getSCCNum(MBB); + if (SccNum == INVALIDSCCNUM) + dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; + } +} + +int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { + int NumMatch = 0; + int CurMatch; + + DEBUG( + dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; + ); + + while ((CurMatch = patternMatchGroup(MBB)) > 0) + NumMatch += CurMatch; + + DEBUG( + dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n"; + ); + + return NumMatch; +} + +int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { + int NumMatch = 0; + NumMatch += loopendPatternMatch(); + NumMatch += serialPatternMatch(MBB); + NumMatch += ifPatternMatch(MBB); + return NumMatch; +} + + +int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { + if (MBB->succ_size() != 1) + return 0; + + MachineBasicBlock *childBlk = *MBB->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) + return 0; + + mergeSerialBlock(MBB, childBlk); + ++numSerialPatternMatch; + return 1; +} + +int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { + //two edges + if (MBB->succ_size() != 2) + return 0; + if (hasBackEdge(MBB)) + return 0; + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + if (!BranchMI) + return 0; + + assert(isCondBranch(BranchMI)); + int NumMatch = 0; + + MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); + NumMatch += serialPatternMatch(TrueMBB); + NumMatch += ifPatternMatch(TrueMBB); + MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); + NumMatch += serialPatternMatch(FalseMBB); + NumMatch += ifPatternMatch(FalseMBB); + MachineBasicBlock *LandBlk; + int Cloned = 0; + + assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); + // TODO: Simplify + if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 + && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { + // Diamond pattern + LandBlk = *TrueMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { + // Triangle pattern, false is empty + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && *FalseMBB->succ_begin() == TrueMBB) { + // Triangle pattern, true is empty + // We reverse the predicate to make a triangle, empty false pattern; + std::swap(TrueMBB, FalseMBB); + reversePredicateSetter(MBB->end()); + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { + LandBlk = *FalseMBB->succ_begin(); + } else if 
(TrueMBB->succ_size() == 1 + && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { + LandBlk = *TrueMBB->succ_begin(); + } else { + return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); + } + + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new challenge to the + // reduction process. + if (LandBlk && + ((TrueMBB && TrueMBB->pred_size() > 1) + || (FalseMBB && FalseMBB->pred_size() > 1))) { + Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); + } + + if (TrueMBB && TrueMBB->pred_size() > 1) { + TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); + ++Cloned; + } + + if (FalseMBB && FalseMBB->pred_size() > 1) { + FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); + ++Cloned; + } + + mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); + + ++numIfPatternMatch; + + numClonedBlock += Cloned; + + return 1 + Cloned + NumMatch; +} + +int AMDGPUCFGStructurizer::loopendPatternMatch() { + std::deque<MachineLoop *> NestedLoops; + for (auto &It: *MLI) + for (MachineLoop *ML : depth_first(It)) + NestedLoops.push_front(ML); + + if (NestedLoops.size() == 0) + return 0; + + // Process nested loop outside->inside (we did push_front), + // so "continue" to a outside loop won't be mistaken as "break" + // of the current loop. + int Num = 0; + for (MachineLoop *ExaminedLoop : NestedLoops) { + if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) + continue; + DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); + int NumBreak = mergeLoop(ExaminedLoop); + if (NumBreak == -1) + break; + Num += NumBreak; + } + return Num; +} + +int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); + DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); + // We assume a single ExitBlk + MBBVector ExitBlks; + LoopRep->getExitBlocks(ExitBlks); + SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet; + for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) + ExitBlkSet.insert(ExitBlks[i]); + assert(ExitBlkSet.size() == 1); + MachineBasicBlock *ExitBlk = *ExitBlks.begin(); + assert(ExitBlk && "Loop has several exit block"); + MBBVector LatchBlks; + typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits; + InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), + PE = InvMBBTraits::child_end(LoopHeader); + for (; PI != PE; PI++) { + if (LoopRep->contains(*PI)) + LatchBlks.push_back(*PI); + } + + for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) + mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); + for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) + settleLoopcontBlock(LatchBlks[i], LoopHeader); + int Match = 0; + do { + Match = 0; + Match += serialPatternMatch(LoopHeader); + Match += ifPatternMatch(LoopHeader); + } while (Match > 0); + mergeLooplandBlock(LoopHeader, ExitBlk); + MachineLoop *ParentLoop = LoopRep->getParentLoop(); + if (ParentLoop) + MLI->changeLoopFor(LoopHeader, ParentLoop); + else + MLI->removeBlock(LoopHeader); + Visited[LoopRep] = true; + return 1; +} + +int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, + MachineBasicBlock *LoopHeader) { + int NumCont = 0; + SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB; + typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM; + GTIM::ChildIteratorType It = 
GTIM::child_begin(LoopHeader), + E = GTIM::child_end(LoopHeader); + for (; It != E; ++It) { + MachineBasicBlock *MBB = *It; + if (LoopRep->contains(MBB)) { + handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), + LoopHeader, LoopRep); + ContMBB.push_back(MBB); + ++NumCont; + } + } + + for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), + E = ContMBB.end(); It != E; ++It) { + (*It)->removeSuccessor(LoopHeader, true); + } + + numLoopcontPatternMatch += NumCont; + + return NumCont; +} + + +bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( + MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { + if (Src1MBB->succ_size() == 0) { + MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); + if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { + MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; + if (TheEntry) { + DEBUG( + dbgs() << "isLoopContBreakBlock yes src1 = BB" + << Src1MBB->getNumber() + << " src2 = BB" << Src2MBB->getNumber() << "\n"; + ); + return true; + } + } + } + return false; +} + +int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); + if (Num == 0) { + DEBUG( + dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + ); + Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); + } + return Num; +} + +int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = 0; + MachineBasicBlock *DownBlk; + + //trueBlk could be the common post dominator + DownBlk = TrueMBB; + + DEBUG( + dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() + << " false = BB" << FalseMBB->getNumber() << "\n"; + ); + + while (DownBlk) { + DEBUG( + dbgs() << "check down = BB" << DownBlk->getNumber(); + ); + + if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { + DEBUG( + dbgs() << " working\n"; + ); + + Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); + Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); + + numClonedBlock += Num; + Num += serialPatternMatch(*HeadMBB->succ_begin()); + Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); + Num += ifPatternMatch(HeadMBB); + assert(Num > 0); + + break; + } + DEBUG( + dbgs() << " not working\n"; + ); + DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; + } // walk down the postDomTree + + return Num; +} + +void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( + MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { + dbgs() << "head = BB" << HeadMBB->getNumber() + << " size = " << HeadMBB->size(); + if (Detail) { + dbgs() << "\n"; + HeadMBB->print(dbgs()); + dbgs() << "\n"; + } + + if (TrueMBB) { + dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " + << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + TrueMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (FalseMBB) { + dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " + << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + FalseMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (LandMBB) { + dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " + << LandMBB->size() << " numPred = " << LandMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + LandMBB->print(dbgs()); + dbgs() << "\n"; + } + } + + dbgs() << "\n"; +} + +int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr) { + bool MigrateTrue = false; + bool MigrateFalse = false; + + MachineBasicBlock *LandBlk = *LandMBBPtr; + + assert((!TrueMBB || TrueMBB->succ_size() <= 1) + && (!FalseMBB || FalseMBB->succ_size() <= 1)); + + if (TrueMBB == FalseMBB) + return 0; + + MigrateTrue = needMigrateBlock(TrueMBB); + MigrateFalse = needMigrateBlock(FalseMBB); + + if (!MigrateTrue && !MigrateFalse) + return 0; + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. + if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) + MigrateTrue = true; + if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) + MigrateFalse = true; + + DEBUG( + dbgs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk + // + // new: headBlk => if () {initReg = 1; org trueBlk branch} else + // {initReg = 0; org falseBlk branch } + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} + // => org landBlk + // if landBlk->pred_size() > 2, put the about if-else inside + // if (initReg !=2) {...} + // + // add initReg = initVal to headBlk + + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (!MigrateTrue || !MigrateFalse) { + // XXX: We have an opportunity here to optimize the "branch into if" case + // here. Branch into if looks like this: + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true + // \ / + // done + // + // The diamond_head block begins the "if" and the diamond_true block + // is the block being "branched into". + // + // If MigrateTrue is true, then TrueBB is the block being "branched into" + // and if MigrateFalse is true, then FalseBB is the block being + // "branched into" + // + // Here is the pseudo code for how I think the optimization should work: + // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. + // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. + // 3. 
Move the branch instruction from diamond_head into its own basic + // block (new_block). + // 4. Add an unconditional branch from diamond_head to new_block + // 5. Replace the branch instruction in branch_from with an unconditional + // branch to new_block. If branch_from has multiple predecessors, then + // we need to replace the True/False block in the branch + // instruction instead of replacing it. + // 6. Change the condition of the branch instruction in new_block from + // COND to (COND || GPR0) + // + // In order insert these MOV instruction, we will need to use the + // RegisterScavenger. Usually liveness stops being tracked during + // the late machine optimization passes, however if we implement + // bool TargetRegisterInfo::requiresRegisterScavenging( + // const MachineFunction &MF) + // and have it return true, liveness will be tracked correctly + // by generic optimization passes. We will also need to make sure that + // all of our target-specific passes that run after regalloc and before + // the CFGStructurizer track liveness and we will need to modify this pass + // to correctly track liveness. + // + // After the above changes, the new CFG should look like this: + // entry + // / | + // diamond_head branch_from + // \ / + // new_block + // / | + // diamond_false diamond_true + // \ / + // done + // + // Without this optimization, we are forced to duplicate the diamond_true + // block and we will end up with a CFG like this: + // + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true diamond_true (duplicate) + // \ / | + // done --------------------| + // + // Duplicating diamond_true can be very costly especially if it has a + // lot of instructions. + return 0; + } + + int NumNewBlk = 0; + + bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); + + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + + if (LandBlkHasOtherPred) { + llvm_unreachable("Extra register needed to handle CFG"); + unsigned CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra compare instruction needed to handle CFG"); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + CmpResReg, DebugLoc()); + } + + // XXX: We are running this after RA, so creating virtual registers will + // cause an assertion failure in the PostRA scheduling pass. + unsigned InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + DebugLoc()); + + if (MigrateTrue) { + migrateInstruction(TrueMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 1). 
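+      // Sketch of the missing assignment described above (insertAssignInstrBefore
+      // is a hypothetical helper name; the pass currently hits the
+      // llvm_unreachable below rather than emitting it):
+      //   insertAssignInstrBefore(I, InitReg, 1); // initReg = 1 for the true path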
+ llvm_unreachable("Extra register needed to handle CFG"); + } + insertInstrBefore(I, AMDGPU::ELSE); + + if (MigrateFalse) { + migrateInstruction(FalseMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 0) + llvm_unreachable("Extra register needed to handle CFG"); + } + + if (LandBlkHasOtherPred) { + // add endif + insertInstrBefore(I, AMDGPU::ENDIF); + + // put initReg = 2 to other predecessors of landBlk + for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), + PE = LandBlk->pred_end(); PI != PE; ++PI) { + MachineBasicBlock *MBB = *PI; + if (MBB != TrueMBB && MBB != FalseMBB) + llvm_unreachable("Extra register needed to handle CFG"); + } + } + DEBUG( + dbgs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // update landBlk + *LandMBBPtr = LandBlk; + + return NumNewBlk; +} + +void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop) { + DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() + << " header = BB" << ContMBB->getNumber() << "\n"; + dbgs() << "Trying to continue loop-depth = " + << getLoopDepth(ContLoop) + << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); + settleLoopcontBlock(ContingMBB, ContMBB); +} + +void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + DEBUG( + dbgs() << "serialPattern BB" << DstMBB->getNumber() + << " <= BB" << SrcMBB->getNumber() << "\n"; + ); + DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); + + DstMBB->removeSuccessor(SrcMBB, true); + cloneSuccessorList(DstMBB, SrcMBB); + + removeSuccessor(SrcMBB); + MLI->removeBlock(SrcMBB); + retireBlock(SrcMBB); +} + +void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { + assert (TrueMBB); + DEBUG( + dbgs() << "ifPattern BB" << MBB->getNumber(); + dbgs() << "{ "; + if (TrueMBB) { + dbgs() << "BB" << TrueMBB->getNumber(); + } + dbgs() << " } else "; + dbgs() << "{ "; + if (FalseMBB) { + dbgs() << "BB" << FalseMBB->getNumber(); + } + dbgs() << " }\n "; + dbgs() << "landBlock: "; + if (!LandMBB) { + dbgs() << "NULL"; + } else { + dbgs() << "BB" << LandMBB->getNumber(); + } + dbgs() << "\n"; + ); + + int OldOpcode = BranchMI->getOpcode(); + DebugLoc BranchDL = BranchMI->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + MachineBasicBlock::iterator I = BranchMI; + insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), + BranchDL); + + if (TrueMBB) { + MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); + MBB->removeSuccessor(TrueMBB, true); + if (LandMBB && TrueMBB->succ_size()!=0) + TrueMBB->removeSuccessor(LandMBB, true); + retireBlock(TrueMBB); + MLI->removeBlock(TrueMBB); + } + + if (FalseMBB) { + insertInstrBefore(I, AMDGPU::ELSE); + MBB->splice(I, FalseMBB, FalseMBB->begin(), + FalseMBB->end()); + MBB->removeSuccessor(FalseMBB, true); + if (LandMBB && FalseMBB->succ_size() != 0) + FalseMBB->removeSuccessor(LandMBB, true); + retireBlock(FalseMBB); + MLI->removeBlock(FalseMBB); + } + insertInstrBefore(I, AMDGPU::ENDIF); + + BranchMI->eraseFromParent(); + + if (LandMBB && TrueMBB && FalseMBB) + 
MBB->addSuccessor(LandMBB); + +} + +void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + + insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + DstBlk->replaceSuccessor(DstBlk, LandMBB); +} + + +void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); + assert(BranchMI && isCondBranch(BranchMI)); + DebugLoc DL = BranchMI->getDebugLoc(); + MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); + MachineBasicBlock::iterator I = BranchMI; + if (TrueBranch != LandMBB) + reversePredicateSetter(I); + insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); + insertInstrBefore(I, AMDGPU::BREAK); + insertInstrBefore(I, AMDGPU::ENDIF); + // now the branch instruction can be erased safely + BranchMI->eraseFromParent(); + // now take care of successors and retire blocks + ExitingMBB->removeSuccessor(LandMBB, true); +} + +void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB) { + DEBUG(dbgs() << "settleLoopcontBlock conting = BB" + << ContingMBB->getNumber() + << ", cont = BB" << ContMBB->getNumber() << "\n";); + + MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); + if (MI) { + assert(isCondBranch(MI)); + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + int OldOpcode = MI->getOpcode(); + DebugLoc DL = MI->getDebugLoc(); + + bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); + + if (!UseContinueLogical) { + int BranchOpcode = + TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : + getBranchZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); + insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + } else { + int BranchOpcode = + TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : + getContinueZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + } + + MI->eraseFromParent(); + } else { + // If we've arrived here then the branch instruction has already been + // erased. Walk back up the basic block to find the last debug location + // referenced; since we just inserted that reference it should be + // representative. Use insertEnd so that phi-moves, if they exist, go + // before the continue-instr.
+ insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + getLastDebugLocInBB(ContingMBB)); + } +} + +int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { + int Cloned = 0; + assert(PreMBB->isSuccessor(SrcMBB)); + while (SrcMBB && SrcMBB != DstMBB) { + assert(SrcMBB->succ_size() == 1); + if (SrcMBB->pred_size() > 1) { + SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); + ++Cloned; + } + + PreMBB = SrcMBB; + SrcMBB = *SrcMBB->succ_begin(); + } + + return Cloned; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB) { + assert(PredMBB->isSuccessor(MBB) && + "succBlk is not a prececessor of curBlk"); + + MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions + replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); + //srcBlk, oldBlk, newBlk + + PredMBB->replaceSuccessor(MBB, CloneMBB); + + // add all successor to cloneBlk + cloneSuccessorList(CloneMBB, MBB); + + numClonedInstr += MBB->size(); + + DEBUG( + dbgs() << "Cloned block: " << "BB" + << MBB->getNumber() << "size " << MBB->size() << "\n"; + ); + + SHOWNEWBLK(CloneMBB, "result of Cloned block: "); + + return CloneMBB; +} + +void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator SpliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); + if (!BranchMI) { + DEBUG( + dbgs() << "migrateInstruction don't see branch instr\n" ; + ); + SpliceEnd = SrcMBB->end(); + } else { + DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); + SpliceEnd = BranchMI; + } + DEBUG( + dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); + + //splice insert before insertPos + DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); + + DEBUG( + dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << '\n'; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); + + if (!LoopHeader || !LoopLatch) + return nullptr; + MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); + // Is LoopRep an infinite loop ? + if (!BranchMI || !isUncondBranch(BranchMI)) + return nullptr; + + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); + LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext(); + Ctx.emitError("Extra register needed to handle CFG"); + return nullptr; +} + +void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { + MachineInstr *BranchMI; + + // I saw two unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. 
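+  // Strip trailing unconditional branches until the block ends in a
+  // conditional branch or a fall-through.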
+ while ((BranchMI = getLoopendBlockBranchInstr(MBB)) + && isUncondBranch(BranchMI)) { + DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); + BranchMI->eraseFromParent(); + } +} + +void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( + MachineBasicBlock *MBB) { + if (MBB->succ_size() != 2) + return; + MachineBasicBlock *MBB1 = *MBB->succ_begin(); + MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); + if (MBB1 != MBB2) + return; + + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + assert(BranchMI && isCondBranch(BranchMI)); + DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); + BranchMI->eraseFromParent(); + SHOWNEWBLK(MBB1, "Removing redundant successor"); + MBB->removeSuccessor(MBB1, true); +} + +void AMDGPUCFGStructurizer::addDummyExitBlock( + SmallVectorImpl<MachineBasicBlock*> &RetMBB) { + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + + for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), + E = RetMBB.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + MachineInstr *MI = getReturnInstr(MBB); + if (MI) + MI->eraseFromParent(); + MBB->addSuccessor(DummyExitBlk); + DEBUG( + dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n"; + ); + } + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); +} + +void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { + while (MBB->succ_size()) + MBB->removeSuccessor(*MBB->succ_begin()); +} + +void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, + int SccNum) { + BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; + if (!srcBlkInfo) + srcBlkInfo = new BlockInformation(); + srcBlkInfo->SccNum = SccNum; +} + +void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { + DEBUG( + dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; + ); + + BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; + + if (!SrcBlkInfo) + SrcBlkInfo = new BlockInformation(); + + SrcBlkInfo->IsRetired = true; + assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 + && "can't retire block yet"); +} + +void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, + MachineBasicBlock *MBB) { + MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; + if (!MBB) { + MBB = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(MBB); //insert to function + SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); + } + TheEntry = MBB; + DEBUG( + dbgs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << MBB->getNumber() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2) { + + if (PDT->dominates(MBB1, MBB2)) + return MBB1; + if (PDT->dominates(MBB2, MBB1)) + return MBB2; + + MachineDomTreeNode *Node1 = PDT->getNode(MBB1); + MachineDomTreeNode *Node2 = PDT->getNode(MBB2); + + // Handle newly cloned node. 
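+  // A block created by cloning after the analyses were computed has no node
+  // in the post-dominator tree, so step through its single successor instead
+  // of querying PDT directly.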
+ if (!Node1 && MBB1->succ_size() == 1) + return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); + if (!Node2 && MBB2->succ_size() == 1) + return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); + + if (!Node1 || !Node2) + return nullptr; + + Node1 = Node1->getIDom(); + while (Node1) { + if (PDT->dominates(Node1, Node2)) + return Node1->getBlock(); + Node1 = Node1->getIDom(); + } + + return nullptr; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom( + std::set<MachineBasicBlock *> &MBBs) { + MachineBasicBlock *CommonDom; + std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin(); + std::set<MachineBasicBlock *>::const_iterator E = MBBs.end(); + for (CommonDom = *It; It != E && CommonDom; ++It) { + MachineBasicBlock *MBB = *It; + if (MBB != CommonDom) + CommonDom = findNearestCommonPostDom(MBB, CommonDom); + } + + DEBUG( + dbgs() << "Common post dominator for exit blocks is "; + if (CommonDom) + dbgs() << "BB" << CommonDom->getNumber() << "\n"; + else + dbgs() << "NULL\n"; + ); + + return CommonDom; +} + +char AMDGPUCFGStructurizer::ID = 0; + +} // end anonymous namespace + + +INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) + +FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { + return new AMDGPUCFGStructurizer(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h new file mode 100644 index 0000000..a9ba60c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -0,0 +1,641 @@ +//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeT.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODET_H +#define AMDKERNELCODET_H + +#include "llvm/MC/SubtargetFeature.h" + +#include <cstddef> +#include <cstdint> + +#include "llvm/Support/Debug.h" +//---------------------------------------------------------------------------// +// AMD Kernel Code, and its dependencies // +//---------------------------------------------------------------------------// + +typedef uint8_t hsa_powertwo8_t; +typedef uint32_t hsa_ext_code_kind_t; +typedef uint8_t hsa_ext_brig_profile8_t; +typedef uint8_t hsa_ext_brig_machine_model8_t; +typedef uint64_t hsa_ext_control_directive_present64_t; +typedef uint16_t hsa_ext_exception_kind16_t; +typedef uint32_t hsa_ext_code_kind32_t; + +typedef struct hsa_dim3_s { + uint32_t x; + uint32_t y; + uint32_t z; +} hsa_dim3_t; + +/// The version of the amd_*_code_t struct. Minor versions must be +/// backward compatible. +typedef uint32_t amd_code_version32_t; +enum amd_code_version_t { + AMD_CODE_VERSION_MAJOR = 0, + AMD_CODE_VERSION_MINOR = 1 +}; + +/// The values used to define the number of bytes to use for the +/// swizzle element size. 
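+/// For example, AMD_ELEMENT_4_BYTES (= 1) selects a 4-byte (dword) element,
+/// the size typically used for flat scratch (see
+/// AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE below).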
+enum amd_element_byte_size_t { + AMD_ELEMENT_2_BYTES = 0, + AMD_ELEMENT_4_BYTES = 1, + AMD_ELEMENT_8_BYTES = 2, + AMD_ELEMENT_16_BYTES = 3 +}; + +/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and +/// COMPUTE_PGM_RSRC2 registers. +typedef uint64_t amd_compute_pgm_resource_register64_t; + +/// Every amd_*_code_t has the following properties, which are composed of +/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), +/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount +/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. +/// +/// (Note that bit fields cannot be used as their layout is +/// implementation defined in the C standard and so cannot be used to +/// specify an ABI) +typedef uint32_t amd_code_property32_t; +enum amd_code_property_mask_t { + + /// Enable the setup of the SGPR user data registers + /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + /// for initial register state. + /// + /// The total number of SGPRuser data registers requested must not + /// exceed 16. Any requests beyond 16 will be ignored. + /// + /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + /// SGPR user data registers enabled up to 16). + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + /// Control wave ID base counter for GDS ordered-append. Used to set + /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if + /// ORDERED_APPEND_MODE also needs to be settable) + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /// The interleave (swizzle) element size in bytes required by the + /// code for private memory. This must be 2, 4, 8 or 16. This value + /// is provided to the finalizer when it is invoked and is recorded + /// here. The hardware will interleave the memory requests of each + /// lane of a wavefront by this element size to ensure each + /// work-item gets a distinct memory memory location. Therefore, the + /// finalizer ensures that all load and store operations done to + /// private memory do not exceed this size. For example, if the + /// element size is 4 (32-bits or dword) and a 64-bit value must be + /// loaded, the finalizer will generate two 32-bit loads. This + /// ensures that the interleaving will get the work-item + /// specific dword for both halves of the 64-bit value. If it just + /// did a 64-bit load then it would get one dword which belonged to + /// its own work-item, but the second dword would belong to the + /// adjacent lane work-item since the interleaving is in dwords. + /// + /// The value used must match the value that the runtime configures + /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + /// is generally DWORD. + /// + /// uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /// Are global memory addresses 64 bits. Must match + /// amd_kernel_code_t.hsail_machine_model == + /// HSA_MACHINE_LARGE. Must also match + /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /// Indicate if the generated ISA is using a dynamically sized call + /// stack. This can happen if calls are implemented using a call + /// stack and recursion, alloca or calls to indirect functions are + /// present. In these cases the Finalizer cannot compute the total + /// private segment size at compile time. 
In this case the + /// workitem_private_segment_byte_size only specifies the statically + /// know private segment size, and additional space must be added + /// for the call stack. + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /// Indicate if code generated has support for debugging. + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, + + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT +}; + +/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// control directives. These control how the finalizer generates code. This +/// struct is used both as an argument to hsaFinalizeKernel to specify values for +/// the control directives, and is used in HsaKernelCode to record the values of +/// the control directives that the finalize used when generating the code which +/// either came from the finalizer argument or explicit HSAIL control +/// directives. See the definition of the control directives in HSA Programmer's +/// Reference Manual which also defines how the values specified as finalizer +/// arguments have to agree with the control directives in the HSAIL code. +typedef struct hsa_ext_control_directives_s { + /// This is a bit set indicating which control directives have been + /// specified. If the value is 0 then there are no control directives specified + /// and the rest of the fields can be ignored. The bits are accessed using the + /// hsa_ext_control_directives_present_mask_t. Any control directive that is not + /// enabled in this bit set must have the value of all 0s. + hsa_ext_control_directive_present64_t enabled_control_directives; + + /// If enableBreakExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. If the kernel being finalized + /// has any enablebreakexceptions control directives, then the values specified + /// by this argument are unioned with the values in these control + /// directives. If any of the functions the kernel calls have an + /// enablebreakexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_break_exceptions; + + /// If enableDetectExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. However, an implementation + /// should endeavour to make the performance impact small. If the kernel being + /// finalized has any enabledetectexceptions control directives, then the + /// values specified by this argument are unioned with the values in these + /// control directives. 
If any of the functions the kernel calls have an + /// enabledetectexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_detect_exceptions; + + /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of + /// dynamic group segment can be allocated for a dispatch, otherwise the value + /// specifies the maximum number of bytes of dynamic group segment that can be + /// allocated for a dispatch. If the kernel being finalized has any + /// maxdynamicsize control directives, then the values must be the same, and + /// must be the same as this argument if it is enabled. This value can be used + /// by the finalizer to determine the maximum number of bytes of group memory + /// used by each work-group by adding this value to the group memory required + /// for all group segment variables used by the kernel and all functions it + /// calls, and group memory used to implement other HSAIL features such as + /// fbarriers and the detect exception operations. This can allow the finalizer + /// to determine the expected number of work-groups that can be executed by a + /// compute unit and allow more resources to be allocated to the work-items if + /// it is known that fewer work-groups can be executed due to group memory + /// limitations. + uint32_t max_dynamic_group_size; + + /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater + /// than 0. See HSA Programmer's Reference Manual description of + /// maxflatgridsize control directive. + uint32_t max_flat_grid_size; + + /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be + /// greater than 0. See HSA Programmer's Reference Manual description of + /// maxflatworkgroupsize control directive. + uint32_t max_flat_workgroup_size; + + /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the + /// finalizer is free to generate ISA that may result in any number of + /// work-groups executing on a single compute unit. Otherwise, the finalizer + /// should attempt to generate ISA that will allow the specified number of + /// work-groups to execute on a single compute unit. This is only a hint and + /// can be ignored by the finalizer. If the kernel being finalized, or any of + /// the functions it calls, has a requested control directive, then the values + /// must be the same. This can be used to determine the number of resources + /// that should be allocated to a single work-group and work-item. For example, + /// a low value may allow more resources to be allocated, resulting in higher + /// per work-item performance, as it is known there will never be more than the + /// specified number of work-groups actually executing on the compute + /// unit. Conversely, a high value may allocate fewer resources, resulting in + /// lower per work-item performance, which is offset by the fact it allows more + /// work-groups to actually execute on the compute unit. + uint32_t requested_workgroups_per_cu; + + /// If not enabled then all elements for Dim3 must be 0, otherwise every + /// element must be greater than 0. See HSA Programmer's Reference Manual + /// description of requiredgridsize control directive. + hsa_dim3_t required_grid_size; + + /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be + /// 0, and the produced code can be dispatched with any legal work-group range + /// consistent with the dispatch dimensions. 
Otherwise, the code produced must + /// always be dispatched with the specified work-group range. No element of the + /// specified range must be 0. It must be consistent with required_dimensions + /// and max_flat_workgroup_size. If the kernel being finalized, or any of the + /// functions it calls, has a requiredworkgroupsize control directive, then the + /// values must be the same. Specifying a value can allow the finalizer to + /// optimize work-group id operations, and if the number of work-items in the + /// work-group is less than the WAVESIZE then barrier operations can be + /// optimized to just a memory fence. + hsa_dim3_t required_workgroup_size; + + /// If requiredDim is not enabled then must be 0 and the produced kernel code + /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is + /// 1..3 and the code produced must only be dispatched with a dimension that + /// matches. Other values are illegal. If the kernel being finalized, or any of + /// the functions it calls, has a requireddimsize control directive, then the + /// values must be the same. This can be used to optimize the code generated to + /// compute the absolute and flat work-group and work-item id, and the dim + /// HSAIL operations. + uint8_t required_dim; + + /// Reserved. Must be 0. + uint8_t reserved[75]; +} hsa_ext_control_directives_t; + +/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel +/// Code Object to set up the hardware to execute the kernel dispatch. +/// +/// Initial Kernel Register State. +/// +/// Initial kernel register state will be set up by CP/SPI prior to the start +/// of execution of every wavefront. This is limited by the constraints of the +/// current hardware. +/// +/// The order of the SGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enable_sgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at SGPR0: the first enabled register is SGPR0, the next +/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR +/// number. +/// +/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and +/// apply to all waves of the grid. It is possible to specify more than 16 User +/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 +/// are actually initialized. These are then immediately followed by the System +/// SGPRs that are set up by ADC/SPI and can have different values for each wave +/// of the grid dispatch. +/// +/// SGPR register initial state is defined as follows: +/// +/// Private Segment Buffer (enable_sgpr_private_segment_buffer): +/// Number of User SGPR registers: 4. V# that can be used, together with +/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg +/// segments using a segment address. It must be set as follows: +/// - Base address: of the scratch memory area used by the dispatch. It +/// does not include the scratch wave offset. It will be the per process +/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for +/// example there may be a per pipe offset, or per AQL Queue offset). +/// - Stride + data_format: Element Size * Index Stride (???) +/// - Cache swizzle: ??? +/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for +/// scratch) +/// - Num records: Flat Scratch Work Item Size / Element Size (???) +/// - Dst_sel_*: ??? +/// - Num_format: ??? 
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must +/// agree with amd_kernel_code_t.privateElementSize) +/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must +/// be number of wavefront lanes for scratch, must agree with +/// amd_kernel_code_t.wavefrontSize) +/// - Add tid enable: 1 +/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, +/// - Hash_enable: ??? +/// - Heap: ??? +/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE +/// - Type: 0 (a buffer) (???) +/// +/// Dispatch Ptr (enable_sgpr_dispatch_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet +/// for kernel actually executing. +/// +/// Queue Ptr (enable_sgpr_queue_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for +/// AQL queue on which the dispatch packet was queued. +/// +/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): +/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This +/// is directly copied from the kernargPtr in the dispatch packet. Having CP +/// load it once avoids loading it at the beginning of every wavefront. +/// +/// Dispatch Id (enable_sgpr_dispatch_id): +/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch +/// packet being executed. +/// +/// Flat Scratch Init (enable_sgpr_flat_scratch_init): +/// Number of User SGPR registers: 2. This is 2 SGPRs. +/// +/// For CI/VI: +/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE +/// to base of memory for scratch for this dispatch. This is the same offset +/// used in computing the Scratch Segment Buffer base address. The value of +/// Scratch Wave Offset must be added by the kernel code and moved to +/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. +/// +/// The second SGPR is 32 bit byte size of a single work-item's scratch +/// memory usage. This is directly loaded from the dispatch packet Private +/// Segment Byte Size and rounded up to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in +/// flat memory instructions. Having CP load it once avoids loading it at +/// the beginning of every wavefront. +/// +/// For PI: +/// This is the 64 bit base address of the scratch backing memory for +/// allocated by CP for this dispatch. +/// +/// Private Segment Size (enable_sgpr_private_segment_size): +/// Number of User SGPR registers: 1. The 32 bit byte size of a single +/// work-item's scratch memory allocation. This is the value from the dispatch +/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// Having CP load it once avoids loading it at the beginning of every +/// wavefront. +/// +/// \todo [This will not be used for CI/VI since it is the same value as +/// the second SGPR of Flat Scratch Init. However, it is need for PI which +/// changes meaning of Flat Scratchg Init..] +/// +/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the X dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). +/// +/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): +/// Number of User SGPR registers: 1. 
32 bit count of the number of +/// work-groups in the Y dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Z dimension for the grid being executed. Computed +/// from the fields in the HsaDispatchPacket as +/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Work-Group Id X (enable_sgpr_workgroup_id_x): +/// Number of System SGPR registers: 1. 32 bit work group id in X dimension +/// of grid for wavefront. Always present. +/// +/// Work-Group Id Y (enable_sgpr_workgroup_id_y): +/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension +/// of grid for wavefront. +/// +/// Work-Group Id Z (enable_sgpr_workgroup_id_z): +/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension +/// of grid for wavefront. If present then Work-group Id Y will also be +/// present. +/// +/// Work-Group Info (enable_sgpr_workgroup_info): +/// Number of System SGPR registers: 1. {first_wave, 14'b0000, +/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} +/// +/// Private Segment Wave Byte Offset +/// (enable_sgpr_private_segment_wave_byte_offset): +/// Number of System SGPR registers: 1. 32 bit byte offset from base of +/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg +/// segment address when using Scratch Segment Buffer. It must be added to +/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. +/// +/// +/// The order of the VGPR registers is defined, but the Finalizer can specify +/// which ones are actually set up in the amd_kernel_code_t object using the +/// enable_vgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at VGPR0: the first enabled register is VGPR0, the next +/// enabled register is VGPR1 etc.; disabled registers do not have a VGPR +/// number. +/// +/// VGPR register initial state is defined as follows: +/// +/// Work-Item Id X (always initialized): +/// Number of registers: 1. 32 bit work item id in X dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id Y (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Y dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id Z (enable_vgpr_workitem_id > 1): +/// Number of registers: 1. 32 bit work item id in Z dimension of work-group +/// for wavefront lane. +/// +/// +/// The setting of registers is done by existing GPU hardware as follows: +/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data +/// registers. +/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any +/// combination including none. +/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot +/// be added into the value Flat Scratch Offset which would avoid the +/// Finalizer generated prolog having to do the add. +/// 4) The VGPRs are set by SPI which only supports specifying either (X), +/// (X, Y) or (X, Y, Z). +/// +/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so +/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and +/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
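+///
+/// For illustration only: if the only enabled SGPRs were
+/// enable_sgpr_kernarg_segment_ptr and enable_sgpr_workgroup_id_x, the dense
+/// numbering described above would place the kernarg pointer in SGPR0-SGPR1
+/// (user SGPRs) and the work-group id X in SGPR2 (system SGPR).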
+/// +/// The global segment can be accessed either using flat operations or buffer +/// operations. If buffer operations are used then the Global Buffer used to +/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a +/// segment address is not passed into the kernel code by CP since its base +/// address is always 0. Instead the Finalizer generates prolog code to +/// initialize 4 SGPRs with a V# that has the following properties, and then +/// uses that in the buffer instructions: +/// - base address of 0 +/// - no swizzle +/// - ATC=1 +/// - MTYPE set to support memory coherence specified in +/// amd_kernel_code_t.globalMemoryCoherence +/// +/// When the Global Buffer is used to access the Kernarg segment, must add the +/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. +/// Alternatively scalar loads can be used if the kernarg offset is uniform, as +/// the kernarg segment is constant for the duration of the kernel execution. +/// + +typedef struct amd_kernel_code_s { + uint32_t amd_kernel_code_version_major; + uint32_t amd_kernel_code_version_minor; + uint16_t amd_machine_kind; + uint16_t amd_machine_version_major; + uint16_t amd_machine_version_minor; + uint16_t amd_machine_version_stepping; + + /// Byte offset (possibly negative) from start of amd_kernel_code_t + /// object to kernel's entry point instruction. The actual code for + /// the kernel is required to be 256 byte aligned to match hardware + /// requirements (SQ cache line is 16). The code must be position + /// independent code (PIC) for AMD devices to give runtime the + /// option of copying code to discrete GPU memory or APU L2 + /// cache. The Finalizer should endeavour to allocate all kernel + /// machine code in contiguous memory pages so that a device + /// pre-fetcher will tend to only pre-fetch Kernel Code objects, + /// improving cache performance. + int64_t kernel_code_entry_byte_offset; + + /// Range of bytes to consider prefetching expressed as an offset + /// and size. The offset is from the start (possibly negative) of + /// amd_kernel_code_t object. Set both to 0 if no prefetch + /// information is available. + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + + /// Number of bytes of scratch backing memory required for full + /// occupancy of target chip. This takes into account the number of + /// bytes of scratch per work-item, the wavefront size, the maximum + /// number of wavefronts per CU, and the number of CUs. This is an + /// upper limit on scratch. If the grid being dispatched is small it + /// may only need less than this. If the kernel uses no scratch, or + /// the Finalizer has not computed this value, it must be 0. + uint64_t max_scratch_backing_memory_byte_size; + + /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and + /// COMPUTE_PGM_RSRC2 registers. + uint64_t compute_pgm_resource_registers; + + /// Code properties. See amd_code_property_mask_t for a full list of + /// properties. + uint32_t code_properties; + + /// The amount of memory required for the combined private, spill + /// and arg segments for a work-item in bytes. If + /// is_dynamic_callstack is 1 then additional space must be added to + /// this value for the call stack. + uint32_t workitem_private_segment_byte_size; + + /// The amount of group segment memory required by a work-group in + /// bytes. 
This does not include any dynamically allocated group + /// segment memory that may be added when the kernel is + /// dispatched. + uint32_t workgroup_group_segment_byte_size; + + /// Number of byte of GDS required by kernel dispatch. Must be 0 if + /// not using GDS. + uint32_t gds_segment_byte_size; + + /// The size in bytes of the kernarg segment that holds the values + /// of the arguments to the kernel. This could be used by CP to + /// prefetch the kernarg segment pointed to by the dispatch packet. + uint64_t kernarg_segment_byte_size; + + /// Number of fbarrier's used in the kernel and all functions it + /// calls. If the implementation uses group memory to allocate the + /// fbarriers then that amount must already be included in the + /// workgroup_group_segment_byte_size total. + uint32_t workgroup_fbarrier_count; + + /// Number of scalar registers used by a wavefront. This includes + /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size + /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a + /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. + uint16_t wavefront_sgpr_count; + + /// Number of vector registers used by each work-item. Used to set + /// COMPUTE_PGM_RSRC1.VGPRS. + uint16_t workitem_vgpr_count; + + /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the + /// first fixed VGPR number reserved. + uint16_t reserved_vgpr_first; + + /// The number of consecutive VGPRs reserved by the client. If + /// is_debug_supported then this count includes VGPRs reserved + /// for debugger use. + uint16_t reserved_vgpr_count; + + /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the + /// first fixed SGPR number reserved. + uint16_t reserved_sgpr_first; + + /// The number of consecutive SGPRs reserved by the client. If + /// is_debug_supported then this count includes SGPRs reserved + /// for debugger use. + uint16_t reserved_sgpr_count; + + /// If is_debug_supported is 0 then must be 0. Otherwise, this is the + /// fixed SGPR number used to hold the wave scratch offset for the + /// entire kernel execution, or uint16_t(-1) if the register is not + /// used or not known. + uint16_t debug_wavefront_private_segment_offset_sgpr; + + /// If is_debug_supported is 0 then must be 0. Otherwise, this is the + /// fixed SGPR number of the first of 4 SGPRs used to hold the + /// scratch V# used for the entire kernel execution, or uint16_t(-1) + /// if the registers are not used or not known. + uint16_t debug_private_segment_buffer_sgpr; + + /// The maximum byte alignment of variables used by the kernel in + /// the specified memory segment. Expressed as a power of two. Must + /// be at least HSA_POWERTWO_16. + uint8_t kernarg_segment_alignment; + uint8_t group_segment_alignment; + uint8_t private_segment_alignment; + + /// Wavefront size expressed as a power of two. Must be a power of 2 + /// in range 1..64 inclusive. Used to support runtime query that + /// obtains wavefront size, which may be used by application to + /// allocated dynamic group memory and set the dispatch work-group + /// size. 
+ uint8_t wavefront_size; + + int32_t call_convention; + uint8_t reserved3[12]; + uint64_t runtime_loader_kernel_symbol; + uint64_t control_directives[16]; +} amd_kernel_code_t; + +#endif // AMDKERNELCODET_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp new file mode 100644 index 0000000..d9f753f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -0,0 +1,1892 @@ +//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "AMDKernelCodeT.h" +#include "SIDefines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +struct OptionalOperand; + +class AMDGPUOperand : public MCParsedAsmOperand { + enum KindTy { + Token, + Immediate, + Register, + Expression + } Kind; + + SMLoc StartLoc, EndLoc; + +public: + AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + MCContext *Ctx; + + enum ImmTy { + ImmTyNone, + ImmTyDSOffset0, + ImmTyDSOffset1, + ImmTyGDS, + ImmTyOffset, + ImmTyGLC, + ImmTySLC, + ImmTyTFE, + ImmTyClamp, + ImmTyOMod + }; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct ImmOp { + bool IsFPImm; + ImmTy Type; + int64_t Val; + }; + + struct RegOp { + unsigned RegNo; + int Modifiers; + const MCRegisterInfo *TRI; + const MCSubtargetInfo *STI; + bool IsForcedVOP3; + }; + + union { + TokOp Tok; + ImmOp Imm; + RegOp Reg; + const MCExpr *Expr; + }; + + void addImmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm(getImm())); + } + + StringRef getToken() const { + return StringRef(Tok.Data, Tok.Length); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); + } + + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isReg()) + addRegOperands(Inst, N); + else + addImmOperands(Inst, N); + } + + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm( + Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); + addRegOperands(Inst, N); + } + + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + } + + bool defaultTokenHasSuffix() const { + StringRef Token(Tok.Data, Tok.Length); + + return Token.endswith("_e32") || Token.endswith("_e64"); + } + + bool isToken() const override { + return Kind == Token; + } + + bool isImm() const override { + return Kind == Immediate; + } + + bool isInlineImm() const { + float F = BitsToFloat(Imm.Val); + // TODO: Add 0.5pi for VI + return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + } + + bool isDSOffset0() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset0; + } + + bool isDSOffset1() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset1; + } + + int64_t getImm() const { + return Imm.Val; + } + + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; + } + + bool isRegKind() const { + return Kind == Register; + } + + bool isReg() const override { + return Kind == Register && Reg.Modifiers == -1; + } + + bool isRegWithInputMods() const { + return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + } + + void setModifiers(unsigned Mods) { + assert(isReg()); + Reg.Modifiers = Mods; + } + + bool hasModifiers() const { + assert(isRegKind()); + return Reg.Modifiers != -1; + } + + unsigned getReg() const override { + return Reg.RegNo; + } + + bool isRegOrImm() const { + return isReg() || isImm(); + } + + bool isRegClass(unsigned RCID) const { + return Reg.TRI->getRegClass(RCID).contains(getReg()); + } + + bool isSCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc64() const { + return isImm() || isInlineImm() || + (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + } + + bool isSCSrc64() const { + return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + } + + bool isVCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVCSrc64() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isVSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVSrc64() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isMem() const override { + return false; + } + + bool isExpr() const { + return Kind == Expression; + } + + bool isSoppBrTarget() const { + return isExpr() || isImm(); + } + + SMLoc getStartLoc() const override { + return StartLoc; + } + + SMLoc getEndLoc() const override { + return EndLoc; + } + + void print(raw_ostream &OS) const override { + switch (Kind) { + case Register: + OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + break; + case Immediate: + OS << getImm(); + break; + case Token: + OS << '\'' << getToken() << '\''; + break; + case Expression: + OS << "<expr " << *Expr << '>'; + break; + } + } + + static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { + auto Op = llvm::make_unique<AMDGPUOperand>(Immediate); + Op->Imm.Val = Val; + Op->Imm.IsFPImm = IsFPImm; + Op->Imm.Type = Type; + 
Op->StartLoc = Loc; + Op->EndLoc = Loc; + return Op; + } + + static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { + auto Res = llvm::make_unique<AMDGPUOperand>(Token); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + Res->StartLoc = Loc; + Res->EndLoc = Loc; + return Res; + } + + static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, + bool ForceVOP3) { + auto Op = llvm::make_unique<AMDGPUOperand>(Register); + Op->Reg.RegNo = RegNo; + Op->Reg.TRI = TRI; + Op->Reg.STI = STI; + Op->Reg.Modifiers = -1; + Op->Reg.IsForcedVOP3 = ForceVOP3; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AMDGPUOperand> CreateExpr(const class MCExpr *Expr, SMLoc S) { + auto Op = llvm::make_unique<AMDGPUOperand>(Expression); + Op->Expr = Expr; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + bool isDSOffset() const; + bool isDSOffset01() const; + bool isSWaitCnt() const; + bool isMubufOffset() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; +}; + +class AMDGPUAsmParser : public MCTargetAsmParser { + const MCInstrInfo &MII; + MCAsmParser &Parser; + + unsigned ForcedEncodingSize; + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); + } + + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "AMDGPUGenAsmMatcher.inc" + + /// } + +private: + bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); + bool ParseDirectiveHSACodeObjectVersion(); + bool ParseDirectiveHSACodeObjectISA(); + bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); + bool ParseDirectiveAMDKernelCodeT(); + bool ParseSectionDirectiveHSAText(); + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + bool ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); + +public: +public: + enum AMDGPUMatchResultTy { + Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY + }; + + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0) { + MCAsmParserExtension::Initialize(Parser); + + if (getSTI().getFeatureBits().none()) { + // Set default features. 
+ copySTI().ToggleFeature("SOUTHERN_ISLANDS"); + } + + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + } + + AMDGPUTargetStreamer &getTargetStreamer() { + MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); + return static_cast<AMDGPUTargetStreamer &>(TS); + } + + unsigned getForcedEncodingSize() const { + return ForcedEncodingSize; + } + + void setForcedEncodingSize(unsigned Size) { + ForcedEncodingSize = Size; + } + + bool isForcedVOP3() const { + return ForcedEncodingSize == 64; + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + bool ParseDirective(AsmToken DirectiveID) override; + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default = 0); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, + OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseOptionalOps( + const ArrayRef<OptionalOperand> &OptionalOps, + OperandVector &Operands); + + + void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); + + bool parseCnt(int64_t &IntVal); + OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + + OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); + void cvtFlat(MCInst &Inst, const OperandVector &Operands); + + void cvtMubuf(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseOffset(OperandVector &Operands); + OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseGLC(OperandVector &Operands); + OperandMatchResultTy parseSLC(OperandVector &Operands); + OperandMatchResultTy parseTFE(OperandVector &Operands); + + OperandMatchResultTy parseDMask(OperandVector &Operands); + OperandMatchResultTy parseUNorm(OperandVector &Operands); + OperandMatchResultTy parseR128(OperandVector &Operands); + + void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); +}; + +struct OptionalOperand { + const char *Name; + AMDGPUOperand::ImmTy Type; + bool IsBit; + int64_t Default; + bool (*ConvertResult)(int64_t&); +}; + +} + +static int getRegClass(bool IsVgpr, unsigned RegWidth) { + if (IsVgpr) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::VGPR_32RegClassID; + case 2: return AMDGPU::VReg_64RegClassID; + case 3: return AMDGPU::VReg_96RegClassID; + case 4: return AMDGPU::VReg_128RegClassID; + case 8: return AMDGPU::VReg_256RegClassID; + case 16: return 
AMDGPU::VReg_512RegClassID; + } + } + + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } +} + +static unsigned getRegForName(StringRef RegName) { + + return StringSwitch<unsigned>(RegName) + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("m0", AMDGPU::M0) + .Case("scc", AMDGPU::SCC) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Default(0); +} + +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + const AsmToken Tok = Parser.getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + StringRef RegName = Tok.getString(); + RegNo = getRegForName(RegName); + + if (RegNo) { + Parser.Lex(); + return !subtargetHasRegister(*TRI, RegNo); + } + + // Match vgprs and sgprs + if (RegName[0] != 's' && RegName[0] != 'v') + return true; + + bool IsVgpr = RegName[0] == 'v'; + unsigned RegWidth; + unsigned RegIndexInClass; + if (RegName.size() > 1) { + // We have a 32-bit register + RegWidth = 1; + if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + return true; + Parser.Lex(); + } else { + // We have a register greater than 32-bits. + + int64_t RegLo, RegHi; + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegLo)) + return true; + + if (getLexer().isNot(AsmToken::Colon)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegHi)) + return true; + + if (getLexer().isNot(AsmToken::RBrac)) + return true; + + Parser.Lex(); + RegWidth = (RegHi - RegLo) + 1; + if (IsVgpr) { + // VGPR registers aren't aligned. + RegIndexInClass = RegLo; + } else { + // SGPR registers are aligned. Max alignment is 4 dwords. 
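The alignment rule stated in the comment above is easier to see with a worked example. A minimal standalone sketch, assuming nothing beyond the arithmetic the following code performs (the helper name and the sample ranges are illustrative, not part of the parser):

#include <algorithm>
#include <cassert>

// Hypothetical mirror of the aligned-SGPR rule below: a range s[lo:hi] of
// width N lives in the N-dword register class at index lo / min(N, 4);
// VGPR ranges are unaligned, so their index is simply lo.
static int sgprRangeIndex(unsigned RegLo, unsigned RegWidth) {
  unsigned Size = std::min(RegWidth, 4u);
  if (RegLo % Size != 0)
    return -1; // misaligned range is rejected
  return RegLo / Size;
}

int main() {
  assert(sgprRangeIndex(2, 2) == 1);  // s[2:3]  -> SGPR_64 register #1
  assert(sgprRangeIndex(4, 4) == 1);  // s[4:7]  -> SReg_128 register #1
  assert(sgprRangeIndex(8, 8) == 2);  // s[8:15] -> SReg_256 register #2
  assert(sgprRangeIndex(1, 2) == -1); // s[1:2]  -> rejected, not 2-aligned
  return 0;
}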
+ unsigned Size = std::min(RegWidth, 4u); + if (RegLo % Size != 0) + return true; + + RegIndexInClass = RegLo / Size; + } + } + + int RCID = getRegClass(IsVgpr, RegWidth); + if (RCID == -1) + return true; + + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIndexInClass >= RC.getNumRegs()) + return true; + + RegNo = RC.getRegister(RegIndexInClass); + return !subtargetHasRegister(*TRI, RegNo); +} + +unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + + if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || + (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) + return Match_InvalidOperand; + + if ((TSFlags & SIInstrFlags::VOP3) && + (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) && + getForcedEncodingSize() != 64) + return Match_PreferE32; + + return Match_Success; +} + + +bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + + switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { + default: break; + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, getSTI()); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction not supported on this GPU"); + + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) { + if (isForcedVOP3()) { + // If 64-bit encoding has been forced we can end up with no + // clamp or omod operands if none of the registers have modifiers, + // so we need to add these to the operand list. 
+ AMDGPUOperand &LastOp = + ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + if (LastOp.isRegKind() || + (LastOp.isImm() && + LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { + SMLoc S = Parser.getTok().getLoc(); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyClamp)); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOMod)); + bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, + Out, ErrorInfo, + MatchingInlineAsm); + if (!Res) + return Res; + } + + } + return Error(IDLoc, "too few operands for instruction"); + } + + ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); + } + llvm_unreachable("Implement any new match types added!"); +} + +bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, + uint32_t &Minor) { + if (getLexer().isNot(AsmToken::Integer)) + return TokError("invalid major version"); + + Major = getLexer().getTok().getIntVal(); + Lex(); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("minor version number required, comma expected"); + Lex(); + + if (getLexer().isNot(AsmToken::Integer)) + return TokError("invalid minor version"); + + Minor = getLexer().getTok().getIntVal(); + Lex(); + + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { + + uint32_t Major; + uint32_t Minor; + + if (ParseDirectiveMajorMinor(Major, Minor)) + return true; + + getTargetStreamer().EmitDirectiveHSACodeObjectVersion(Major, Minor); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { + + uint32_t Major; + uint32_t Minor; + uint32_t Stepping; + StringRef VendorName; + StringRef ArchName; + + // If this directive has no arguments, then use the ISA version for the + // targeted GPU. 
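Concretely, this handler accepts the two forms sketched below: with no operands it falls back to the subtarget's own ISA version together with vendor "AMD" and arch "AMDGPU", while the explicit form takes major, minor and stepping integers followed by quoted vendor and arch strings. The numeric values here are placeholders, not something the parser mandates:

  .hsa_code_object_isa
  .hsa_code_object_isa 7, 0, 0, "AMD", "AMDGPU"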
+ if (getLexer().is(AsmToken::EndOfStatement)) { + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); + getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, + Isa.Stepping, + "AMD", "AMDGPU"); + return false; + } + + + if (ParseDirectiveMajorMinor(Major, Minor)) + return true; + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("stepping version number required, comma expected"); + Lex(); + + if (getLexer().isNot(AsmToken::Integer)) + return TokError("invalid stepping version"); + + Stepping = getLexer().getTok().getIntVal(); + Lex(); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("vendor name required, comma expected"); + Lex(); + + if (getLexer().isNot(AsmToken::String)) + return TokError("invalid vendor name"); + + VendorName = getLexer().getTok().getStringContents(); + Lex(); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("arch name required, comma expected"); + Lex(); + + if (getLexer().isNot(AsmToken::String)) + return TokError("invalid arch name"); + + ArchName = getLexer().getTok().getStringContents(); + Lex(); + + getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping, + VendorName, ArchName); + return false; +} + +bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, + amd_kernel_code_t &Header) { + + if (getLexer().isNot(AsmToken::Equal)) + return TokError("expected '='"); + Lex(); + + if (getLexer().isNot(AsmToken::Integer)) + return TokError("amd_kernel_code_t values must be integers"); + + uint64_t Value = getLexer().getTok().getIntVal(); + Lex(); + + if (ID == "kernel_code_version_major") + Header.amd_kernel_code_version_major = Value; + else if (ID == "kernel_code_version_minor") + Header.amd_kernel_code_version_minor = Value; + else if (ID == "machine_kind") + Header.amd_machine_kind = Value; + else if (ID == "machine_version_major") + Header.amd_machine_version_major = Value; + else if (ID == "machine_version_minor") + Header.amd_machine_version_minor = Value; + else if (ID == "machine_version_stepping") + Header.amd_machine_version_stepping = Value; + else if (ID == "kernel_code_entry_byte_offset") + Header.kernel_code_entry_byte_offset = Value; + else if (ID == "kernel_code_prefetch_byte_size") + Header.kernel_code_prefetch_byte_size = Value; + else if (ID == "max_scratch_backing_memory_byte_size") + Header.max_scratch_backing_memory_byte_size = Value; + else if (ID == "compute_pgm_rsrc1_vgprs") + Header.compute_pgm_resource_registers |= S_00B848_VGPRS(Value); + else if (ID == "compute_pgm_rsrc1_sgprs") + Header.compute_pgm_resource_registers |= S_00B848_SGPRS(Value); + else if (ID == "compute_pgm_rsrc1_priority") + Header.compute_pgm_resource_registers |= S_00B848_PRIORITY(Value); + else if (ID == "compute_pgm_rsrc1_float_mode") + Header.compute_pgm_resource_registers |= S_00B848_FLOAT_MODE(Value); + else if (ID == "compute_pgm_rsrc1_priv") + Header.compute_pgm_resource_registers |= S_00B848_PRIV(Value); + else if (ID == "compute_pgm_rsrc1_dx10_clamp") + Header.compute_pgm_resource_registers |= S_00B848_DX10_CLAMP(Value); + else if (ID == "compute_pgm_rsrc1_debug_mode") + Header.compute_pgm_resource_registers |= S_00B848_DEBUG_MODE(Value); + else if (ID == "compute_pgm_rsrc1_ieee_mode") + Header.compute_pgm_resource_registers |= S_00B848_IEEE_MODE(Value); + else if (ID == "compute_pgm_rsrc2_scratch_en") + Header.compute_pgm_resource_registers |= (S_00B84C_SCRATCH_EN(Value) << 32); + else if (ID == "compute_pgm_rsrc2_user_sgpr") + 
Header.compute_pgm_resource_registers |= (S_00B84C_USER_SGPR(Value) << 32); + else if (ID == "compute_pgm_rsrc2_tgid_x_en") + Header.compute_pgm_resource_registers |= (S_00B84C_TGID_X_EN(Value) << 32); + else if (ID == "compute_pgm_rsrc2_tgid_y_en") + Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Y_EN(Value) << 32); + else if (ID == "compute_pgm_rsrc2_tgid_z_en") + Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Z_EN(Value) << 32); + else if (ID == "compute_pgm_rsrc2_tg_size_en") + Header.compute_pgm_resource_registers |= (S_00B84C_TG_SIZE_EN(Value) << 32); + else if (ID == "compute_pgm_rsrc2_tidig_comp_cnt") + Header.compute_pgm_resource_registers |= + (S_00B84C_TIDIG_COMP_CNT(Value) << 32); + else if (ID == "compute_pgm_rsrc2_excp_en_msb") + Header.compute_pgm_resource_registers |= + (S_00B84C_EXCP_EN_MSB(Value) << 32); + else if (ID == "compute_pgm_rsrc2_lds_size") + Header.compute_pgm_resource_registers |= (S_00B84C_LDS_SIZE(Value) << 32); + else if (ID == "compute_pgm_rsrc2_excp_en") + Header.compute_pgm_resource_registers |= (S_00B84C_EXCP_EN(Value) << 32); + else if (ID == "compute_pgm_resource_registers") + Header.compute_pgm_resource_registers = Value; + else if (ID == "enable_sgpr_private_segment_buffer") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT); + else if (ID == "enable_sgpr_dispatch_ptr") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT); + else if (ID == "enable_sgpr_queue_ptr") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT); + else if (ID == "enable_sgpr_kernarg_segment_ptr") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT); + else if (ID == "enable_sgpr_dispatch_id") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT); + else if (ID == "enable_sgpr_flat_scratch_init") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT); + else if (ID == "enable_sgpr_private_segment_size") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT); + else if (ID == "enable_sgpr_grid_workgroup_count_x") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT); + else if (ID == "enable_sgpr_grid_workgroup_count_y") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT); + else if (ID == "enable_sgpr_grid_workgroup_count_z") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT); + else if (ID == "enable_ordered_append_gds") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT); + else if (ID == "private_element_size") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT); + else if (ID == "is_ptr64") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_IS_PTR64_SHIFT); + else if (ID == "is_dynamic_callstack") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT); + else if (ID == "is_debug_enabled") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT); + else if (ID == "is_xnack_enabled") + Header.code_properties |= + (Value << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT); + else if (ID == "workitem_private_segment_byte_size") + Header.workitem_private_segment_byte_size = Value; + else if (ID == 
"workgroup_group_segment_byte_size") + Header.workgroup_group_segment_byte_size = Value; + else if (ID == "gds_segment_byte_size") + Header.gds_segment_byte_size = Value; + else if (ID == "kernarg_segment_byte_size") + Header.kernarg_segment_byte_size = Value; + else if (ID == "workgroup_fbarrier_count") + Header.workgroup_fbarrier_count = Value; + else if (ID == "wavefront_sgpr_count") + Header.wavefront_sgpr_count = Value; + else if (ID == "workitem_vgpr_count") + Header.workitem_vgpr_count = Value; + else if (ID == "reserved_vgpr_first") + Header.reserved_vgpr_first = Value; + else if (ID == "reserved_vgpr_count") + Header.reserved_vgpr_count = Value; + else if (ID == "reserved_sgpr_first") + Header.reserved_sgpr_first = Value; + else if (ID == "reserved_sgpr_count") + Header.reserved_sgpr_count = Value; + else if (ID == "debug_wavefront_private_segment_offset_sgpr") + Header.debug_wavefront_private_segment_offset_sgpr = Value; + else if (ID == "debug_private_segment_buffer_sgpr") + Header.debug_private_segment_buffer_sgpr = Value; + else if (ID == "kernarg_segment_alignment") + Header.kernarg_segment_alignment = Value; + else if (ID == "group_segment_alignment") + Header.group_segment_alignment = Value; + else if (ID == "private_segment_alignment") + Header.private_segment_alignment = Value; + else if (ID == "wavefront_size") + Header.wavefront_size = Value; + else if (ID == "call_convention") + Header.call_convention = Value; + else if (ID == "runtime_loader_kernel_symbol") + Header.runtime_loader_kernel_symbol = Value; + else + return TokError("amd_kernel_code_t value not recognized."); + + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { + + amd_kernel_code_t Header; + AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); + + while (true) { + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("amd_kernel_code_t values must begin on a new line"); + + // Lex EndOfStatement. This is in a while loop, because lexing a comment + // will set the current token to EndOfStatement. 
+ while(getLexer().is(AsmToken::EndOfStatement)) + Lex(); + + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected value identifier or .end_amd_kernel_code_t"); + + StringRef ID = getLexer().getTok().getIdentifier(); + Lex(); + + if (ID == ".end_amd_kernel_code_t") + break; + + if (ParseAMDKernelCodeTValue(ID, Header)) + return true; + } + + getTargetStreamer().EmitAMDKernelCodeT(Header); + + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSATextSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef KernelName = Parser.getTok().getString(); + + getTargetStreamer().EmitAMDGPUSymbolType(KernelName, + ELF::STT_AMDGPU_HSA_KERNEL); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getString(); + + if (IDVal == ".hsa_code_object_version") + return ParseDirectiveHSACodeObjectVersion(); + + if (IDVal == ".hsa_code_object_isa") + return ParseDirectiveHSACodeObjectISA(); + + if (IDVal == ".amd_kernel_code_t") + return ParseDirectiveAMDKernelCodeT(); + + if (IDVal == ".hsatext" || IDVal == ".text") + return ParseSectionDirectiveHSAText(); + + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); + + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + + return true; +} + +bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, + unsigned RegNo) const { + if (isCI()) + return true; + + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + return false; + default: + return true; + } + } + + // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that + // SI/CI 
have.
+  for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
+       R.isValid(); ++R) {
+    if (*R == RegNo)
+      return false;
+  }
+
+  return true;
+}
+
+static bool operandsHaveModifiers(const OperandVector &Operands) {
+
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
+    if (Op.isRegKind() && Op.hasModifiers())
+      return true;
+    if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod ||
+                       Op.getImmTy() == AMDGPUOperand::ImmTyClamp))
+      return true;
+  }
+  return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  // Try to parse with a custom parser
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If we successfully parsed the operand or if there was an error parsing,
+  // we are done.
+  //
+  // If we are parsing after we reach EndOfStatement then this means we
+  // are appending default values to the Operands list. This is only done
+  // by a custom parser, so we shouldn't continue on to the generic parsing.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
+      getLexer().is(AsmToken::EndOfStatement))
+    return ResTy;
+
+  bool Negate = false, Abs = false;
+  if (getLexer().getKind() == AsmToken::Minus) {
+    Parser.Lex();
+    Negate = true;
+  }
+
+  if (getLexer().getKind() == AsmToken::Pipe) {
+    Parser.Lex();
+    Abs = true;
+  }
+
+  switch(getLexer().getKind()) {
+    case AsmToken::Integer: {
+      SMLoc S = Parser.getTok().getLoc();
+      int64_t IntVal;
+      if (getParser().parseAbsoluteExpression(IntVal))
+        return MatchOperand_ParseFail;
+      if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
+        Error(S, "invalid immediate: only 32-bit values are legal");
+        return MatchOperand_ParseFail;
+      }
+
+      if (Negate)
+        IntVal *= -1;
+      Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
+      return MatchOperand_Success;
+    }
+    case AsmToken::Real: {
+      // FIXME: We should emit an error if a double precision floating-point
+      // value is used. I'm not sure the best way to detect this.
+      SMLoc S = Parser.getTok().getLoc();
+      int64_t IntVal;
+      if (getParser().parseAbsoluteExpression(IntVal))
+        return MatchOperand_ParseFail;
+
+      APFloat F((float)BitsToDouble(IntVal));
+      if (Negate)
+        F.changeSign();
+      Operands.push_back(
+          AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S));
+      return MatchOperand_Success;
+    }
+    case AsmToken::Identifier: {
+      SMLoc S, E;
+      unsigned RegNo;
+      if (!ParseRegister(RegNo, S, E)) {
+
+        bool HasModifiers = operandsHaveModifiers(Operands);
+        unsigned Modifiers = 0;
+
+        if (Negate)
+          Modifiers |= 0x1;
+
+        if (Abs) {
+          if (getLexer().getKind() != AsmToken::Pipe)
+            return MatchOperand_ParseFail;
+          Parser.Lex();
+          Modifiers |= 0x2;
+        }
+
+        if (Modifiers && !HasModifiers) {
+          // We are adding a modifier to src1 or src2 and previous sources
+          // don't have modifiers, so we need to go back and empty modifiers
+          // for each previous source.
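The modifier word being back-filled here is a plain two-bit mask assembled a few lines above: bit 0 records a leading '-' (negation) and bit 1 records enclosing '|...|' (absolute value). A minimal standalone sketch of that mapping, with a hypothetical helper name and illustrative operand spellings:

#include <cassert>

// Hypothetical mirror of the encoding used above: bit 0 is set for a
// leading '-' (negate), bit 1 for enclosing '|...|' (absolute value).
static unsigned srcModifierMask(bool Negate, bool Abs) {
  unsigned Mods = 0;
  if (Negate)
    Mods |= 0x1;
  if (Abs)
    Mods |= 0x2;
  return Mods;
}

int main() {
  assert(srcModifierMask(false, false) == 0); // v2    -> no modifiers
  assert(srcModifierMask(true, false) == 1);  // -v2   -> neg only
  assert(srcModifierMask(false, true) == 2);  // |v2|  -> abs only
  assert(srcModifierMask(true, true) == 3);   // -|v2| -> neg and abs
  return 0;
}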
+ for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; + --PrevRegIdx) { + + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); + RegOp.setModifiers(0); + } + } + + + Operands.push_back(AMDGPUOperand::CreateReg( + RegNo, S, E, getContext().getRegisterInfo(), &getSTI(), + isForcedVOP3())); + + if (HasModifiers || Modifiers) { + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); + RegOp.setModifiers(Modifiers); + + } + } else { + Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), + S)); + Parser.Lex(); + } + return MatchOperand_Success; + } + default: + return MatchOperand_NoMatch; + } +} + +bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + + // Clear any forced encodings from the previous instruction. + setForcedEncodingSize(0); + + if (Name.endswith("_e64")) + setForcedEncodingSize(64); + else if (Name.endswith("_e32")) + setForcedEncodingSize(32); + + // Add the instruction mnemonic + Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); + + while (!getLexer().is(AsmToken::EndOfStatement)) { + AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); + + // Eat the comma or space if there is one. + if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + switch (Res) { + case MatchOperand_Success: break; + case MatchOperand_ParseFail: return Error(getLexer().getLoc(), + "failed parsing operand."); + case MatchOperand_NoMatch: return Error(getLexer().getLoc(), + "not a valid operand."); + } + } + + // Once we reach end of statement, continue parsing so we can add default + // values for optional arguments. + AMDGPUAsmParser::OperandMatchResultTy Res; + while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Error(getLexer().getLoc(), "failed parsing operand."); + } + return false; +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default) { + + // We are at the end of the statement, and this is a default argument, so + // use a default value. 
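The optional operands handled by this routine and by parseNamedBit below follow a simple "name:value" surface syntax, with a default supplied when the name is absent. A toy sketch of just that convention, deliberately ignoring the MC plumbing (the helper name and the sample operand strings are illustrative):

#include <cassert>
#include <cstdint>
#include <string>

// Toy illustration of the "prefix:value" convention used for optional
// operands such as "offset:16": when the named prefix is absent, the
// operand silently takes its default value.
static int64_t namedIntOrDefault(const std::string &Text,
                                 const std::string &Prefix,
                                 int64_t Default) {
  const std::string Key = Prefix + ":";
  std::string::size_type Pos = Text.find(Key);
  if (Pos == std::string::npos)
    return Default;
  return std::stoll(Text.substr(Pos + Key.size()));
}

int main() {
  // e.g. a ds-style operand list "v0, v1 offset:16" vs. plain "v0, v1"
  assert(namedIntOrDefault("v0, v1 offset:16", "offset", 0) == 16);
  assert(namedIntOrDefault("v0, v1", "offset", 0) == 0);
  return 0;
}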
+ if (getLexer().is(AsmToken::EndOfStatement)) { + Int = Default; + return MatchOperand_Success; + } + + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Identifier: { + StringRef OffsetName = Parser.getTok().getString(); + if (!OffsetName.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + if (getParser().parseAbsoluteExpression(Int)) + return MatchOperand_ParseFail; + break; + } + } + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + + SMLoc S = Parser.getTok().getLoc(); + int64_t Offset = 0; + + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + if (Res != MatchOperand_Success) + return Res; + + Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + int64_t Bit = 0; + SMLoc S = Parser.getTok().getLoc(); + + // We are at the end of the statement, and this is a default argument, so + // use a default value. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + switch(getLexer().getKind()) { + case AsmToken::Identifier: { + StringRef Tok = Parser.getTok().getString(); + if (Tok == Name) { + Bit = 1; + Parser.Lex(); + } else if (Tok.startswith("no") && Tok.endswith(Name)) { + Bit = 0; + Parser.Lex(); + } else { + return MatchOperand_NoMatch; + } + break; + } + default: + return MatchOperand_NoMatch; + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); + return MatchOperand_Success; +} + +static bool operandsHasOptionalOp(const OperandVector &Operands, + const OptionalOperand &OOp) { + for (unsigned i = 0; i < Operands.size(); i++) { + const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); + if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || + (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) + return true; + + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps, + OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + for (const OptionalOperand &Op : OptionalOps) { + if (operandsHasOptionalOp(Operands, Op)) + continue; + AMDGPUAsmParser::OperandMatchResultTy Res; + int64_t Value; + if (Op.IsBit) { + Res = parseNamedBit(Op.Name, Operands, Op.Type); + if (Res == MatchOperand_NoMatch) + continue; + return Res; + } + + Res = parseIntWithPrefix(Op.Name, Value, Op.Default); + + if (Res == MatchOperand_NoMatch) + continue; + + if (Res != MatchOperand_Success) + return Res; + + if (Op.ConvertResult && !Op.ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; +} + +//===----------------------------------------------------------------------===// +// ds +//===----------------------------------------------------------------------===// + +static const OptionalOperand DSOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +static const 
OptionalOperand DSOptionalOpsOff01 [] = { + {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, + {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOps, Operands); +} +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOpsOff01, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + AMDGPUAsmParser::OperandMatchResultTy Res = + parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); + if (Res == MatchOperand_NoMatch) { + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOffset)); + Res = MatchOperand_Success; + } + return Res; +} + +bool AMDGPUOperand::isDSOffset() const { + return isImm() && isUInt<16>(getImm()); +} + +bool AMDGPUOperand::isDSOffset01() const { + return isImm() && isUInt<8>(getImm()); +} + +void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, + const OperandVector &Operands) { + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; + unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + + ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 + ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + +void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + bool GDSOnly = false; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + if (Op.isToken() && Op.getToken() == "gds") { + GDSOnly = true; + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + + if (!GDSOnly) { + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + } + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + +bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { + StringRef CntName = Parser.getTok().getString(); + int64_t CntVal; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return true; + + if (getParser().parseAbsoluteExpression(CntVal)) + return true; + + if 
(getLexer().isNot(AsmToken::RParen)) + return true; + + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + int CntShift; + int CntMask; + + if (CntName == "vmcnt") { + CntMask = 0xf; + CntShift = 0; + } else if (CntName == "expcnt") { + CntMask = 0x7; + CntShift = 4; + } else if (CntName == "lgkmcnt") { + CntMask = 0x7; + CntShift = 8; + } else { + return true; + } + + IntVal &= ~(CntMask << CntShift); + IntVal |= (CntVal << CntShift); + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { + // Disable all counters by default. + // vmcnt [3:0] + // expcnt [6:4] + // lgkmcnt [10:8] + int64_t CntVal = 0x77f; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: + // The operand can be an integer value. + if (getParser().parseAbsoluteExpression(CntVal)) + return MatchOperand_ParseFail; + break; + + case AsmToken::Identifier: + do { + if (parseCnt(CntVal)) + return MatchOperand_ParseFail; + } while(getLexer().isNot(AsmToken::EndOfStatement)); + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} + +//===----------------------------------------------------------------------===// +// sopp branch targets +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + switch (getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: { + int64_t Imm; + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + return MatchOperand_Success; + } + + case AsmToken::Identifier: + Operands.push_back(AMDGPUOperand::CreateExpr( + MCSymbolRefExpr::create(getContext().getOrCreateSymbol( + Parser.getTok().getString()), getContext()), S)); + Parser.Lex(); + return MatchOperand_Success; + } +} + +//===----------------------------------------------------------------------===// +// flat +//===----------------------------------------------------------------------===// + +static const OptionalOperand FlatOptionalOps [] = { + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +static const OptionalOperand FlatAtomicOptionalOps [] = { + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatAtomicOptionalOps, Operands); +} + +void AMDGPUAsmParser::cvtFlat(MCInst &Inst, + const OperandVector &Operands) { + std::map<AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle 'glc' token which is sometimes hard-coded into the + // asm string. 
There are no MCInst operands for these. + if (Op.isToken()) + continue; + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + + } + + // flat atomic instructions don't have a glc argument. + if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + } + + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mubuf +//===----------------------------------------------------------------------===// + +static const OptionalOperand MubufOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { + return parseOptionalOps(MubufOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOffset(OperandVector &Operands) { + return parseIntWithPrefix("offset", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseGLC(OperandVector &Operands) { + return parseNamedBit("glc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSLC(OperandVector &Operands) { + return parseNamedBit("slc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseTFE(OperandVector &Operands) { + return parseNamedBit("tfe", Operands); +} + +bool AMDGPUOperand::isMubufOffset() const { + return isImm() && isUInt<12>(getImm()); +} + +void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, + const OperandVector &Operands) { + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
+    if (Op.isToken()) {
+      continue;
+    }
+    assert(Op.isImm());
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  assert(OptionalIdx.size() == 4);
+
+  unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset];
+  unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC];
+  unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC];
+  unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE];
+
+  ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1);
+  ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1);
+  ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1);
+  ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1);
+}
+
+//===----------------------------------------------------------------------===//
+// mimg
+//===----------------------------------------------------------------------===//
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseDMask(OperandVector &Operands) {
+  return parseIntWithPrefix("dmask", Operands);
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseUNorm(OperandVector &Operands) {
+  return parseNamedBit("unorm", Operands);
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseR128(OperandVector &Operands) {
+  return parseNamedBit("r128", Operands);
+}
+
+//===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSMRDOffset() const {
+
+  // FIXME: Support 20-bit offsets on VI. We need to pass subtarget
+  // information here.
+  return isImm() && isUInt<8>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+  // 32-bit literals are only supported on CI and we only want to use them
+  // when the offset is > 8 bits.
+  return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
+
+//===----------------------------------------------------------------------===//
+// vop3
+//===----------------------------------------------------------------------===//
+
+static bool ConvertOmodMul(int64_t &Mul) {
+  if (Mul != 1 && Mul != 2 && Mul != 4)
+    return false;
+
+  Mul >>= 1;
+  return true;
+}
+
+static bool ConvertOmodDiv(int64_t &Div) {
+  if (Div == 1) {
+    Div = 0;
+    return true;
+  }
+
+  if (Div == 2) {
+    Div = 3;
+    return true;
+  }
+
+  return false;
+}
+
+static const OptionalOperand VOP3OptionalOps [] = {
+  {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr},
+  {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul},
+  {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv},
+};
+
+static bool isVOP3(OperandVector &Operands) {
+  if (operandsHaveModifiers(Operands))
+    return true;
+
+  AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]);
+
+  if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID))
+    return true;
+
+  if (Operands.size() >= 5)
+    return true;
+
+  if (Operands.size() > 3) {
+    AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]);
+    if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) ||
+                            Src1Op.isRegClass(AMDGPU::SReg_64RegClassID)))
+      return true;
+  }
+  return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
+
+  // The value returned by this function may change after parsing
+  // an operand, so store the original value here.
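Taken together, ConvertOmodMul and ConvertOmodDiv above collapse the mul:/div: syntax into the two-bit output-modifier value: 1 for mul:2, 2 for mul:4, 3 for div:2, and 0 otherwise (reading 0 as "no output modifier" is an assumption here, not something stated in this file). A minimal standalone restatement with hypothetical helper names:

#include <cassert>
#include <cstdint>

// Mirrors ConvertOmodMul / ConvertOmodDiv above: the parsed factor is
// rewritten in place into the omod field value.
static bool encodeOmodMul(int64_t &Mul) {
  if (Mul != 1 && Mul != 2 && Mul != 4)
    return false;
  Mul >>= 1; // 1 -> 0, 2 -> 1, 4 -> 2
  return true;
}

static bool encodeOmodDiv(int64_t &Div) {
  if (Div == 1) { Div = 0; return true; } // div:1 is a no-op
  if (Div == 2) { Div = 3; return true; } // div:2 -> field value 3
  return false;
}

int main() {
  int64_t V = 2;
  assert(encodeOmodMul(V) && V == 1); // mul:2 -> omod 1
  V = 4;
  assert(encodeOmodMul(V) && V == 2); // mul:4 -> omod 2
  V = 2;
  assert(encodeOmodDiv(V) && V == 3); // div:2 -> omod 3
  V = 3;
  assert(!encodeOmodMul(V));          // mul:3 is rejected
  return 0;
}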
+  bool HasModifiers = operandsHaveModifiers(Operands);
+
+  bool IsVOP3 = isVOP3(Operands);
+  if (HasModifiers || IsVOP3 ||
+      getLexer().isNot(AsmToken::EndOfStatement) ||
+      getForcedEncodingSize() == 64) {
+
+    AMDGPUAsmParser::OperandMatchResultTy Res =
+        parseOptionalOps(VOP3OptionalOps, Operands);
+
+    if (!HasModifiers && Res == MatchOperand_Success) {
+      // We have added a modifier operation, so we need to make sure all
+      // previous register operands have modifiers.
+      for (unsigned i = 2, e = Operands.size(); i != e; ++i) {
+        AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
+        if (Op.isReg())
+          Op.setModifiers(0);
+      }
+    }
+    return Res;
+  }
+  return MatchOperand_NoMatch;
+}
+
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+
+  unsigned i = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  if (Desc.getNumDefs() > 0) {
+    ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+  }
+
+  std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+
+  if (operandsHaveModifiers(Operands)) {
+    for (unsigned e = Operands.size(); i != e; ++i) {
+      AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+      if (Op.isRegWithInputMods()) {
+        ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2);
+        continue;
+      }
+      OptionalIdx[Op.getImmTy()] = i;
+    }
+
+    unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp];
+    unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod];
+
+    ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1);
+    ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1);
+  } else {
+    for (unsigned e = Operands.size(); i != e; ++i)
+      ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1);
+  }
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAMDGPUAsmParser() {
+  RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+  RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
new file mode 100644
index 0000000..c543814
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -0,0 +1,333 @@
+//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===// +// Remaining instructions: +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 + + +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isCIVI in { + +let SchedRW = [WriteDoubleAdd] in { +defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", + VOP_F64_F64, ftrunc +>; +defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64", + VOP_F64_F64, fceil +>; +defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", + VOP_F64_F64, ffloor +>; +defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", + VOP_F64_F64, frint +>; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { +defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", + VOP_F32_F32 +>; +defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", + VOP_F32_F32 +>; +} // End SchedRW = [WriteQuarterRate32] + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? 
+defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; + +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, + "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, + "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol +>; + +//===----------------------------------------------------------------------===// +// Flat Instructions +//===----------------------------------------------------------------------===// + +defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < + flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 +>; +defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < + flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 +>; +defm FLAT_LOAD_USHORT : FLAT_Load_Helper < + flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 +>; +defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < + flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> +; +defm FLAT_LOAD_DWORD : FLAT_Load_Helper < + flat<0xc, 0x14>, "flat_load_dword", VGPR_32 +>; +defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < + flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 +>; +defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < + flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 +>; +defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < + flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 +>; +defm FLAT_STORE_BYTE : FLAT_Store_Helper < + flat<0x18>, "flat_store_byte", VGPR_32 +>; +defm FLAT_STORE_SHORT : FLAT_Store_Helper < + flat <0x1a>, "flat_store_short", VGPR_32 +>; +defm FLAT_STORE_DWORD : FLAT_Store_Helper < + flat<0x1c>, "flat_store_dword", VGPR_32 +>; +defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + flat<0x1d>, "flat_store_dwordx2", VReg_64 +>; +defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 +>; +defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 +>; +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 +>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 +>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 +>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 +>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 +>; +defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC < + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 +>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 +>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC < + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 +>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC < + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 +>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < + flat<0x3b, 0x4a>, 
"flat_atomic_xor", VGPR_32 +>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC < + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 +>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 +>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 +>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 +>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 +>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 +>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 +>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 +>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 +>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 +>; + +} // End SubtargetPredicate = isCIVI + +// CI Only flat instructions + +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < + flat<0x3f>, "flat_atomic_fmin", VGPR_32 +>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < + flat<0x40>, "flat_atomic_fmax", VGPR_32 +>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 +>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < + flat<0x60>, "flat_atomic_fmax_x2", VReg_64 +>; + +} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + + +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isCIVI] in { + +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; +def : 
FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +} // End Predicates = [isCIVI] diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td new file mode 100644 index 0000000..a6c3785 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -0,0 +1,230 @@ +//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available only on Cayman +// family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isCayman : Predicate<"Subtarget->hasCaymanISA()">; + +//===----------------------------------------------------------------------===// +// Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isCayman] in { + +def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU +>; +def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU +>; + +def : IMad24Pat<MULADD_INT24_cm>; + +let isVector = 1 in { + +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; + +def MULLO_INT_cm : MULLO_INT_Common<0x8F>; +def MULHI_INT_cm : MULHI_INT_Common<0x90>; +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; +def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; +def SIN_cm : SIN_Common<0x8D>; +def COS_cm : COS_Common<0x8E>; +} // End isVector = 1 + +def : RsqPat<RECIPSQRT_IEEE_cm, f32>; + +def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; + +defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; +defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>; + +// RECIP_UINT emulation for Cayman +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), + (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) +>; + + def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { + let ADDR = 0; + let POP_COUNT = 0; + let COUNT = 0; + } + + +def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; + +class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> : + CF_MEM_RAT_CACHELESS <0x14, 0, mask, + (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), + "STORE_DWORD $rw_gpr, $index_gpr", + [(global_store vt:$rw_gpr, i32:$index_gpr)]> { + let eop = 0; // This bit is not used on Cayman. +} + +def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; +def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; +def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; + +def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { + let eop = 0; // This bit is not used on Cayman. +} + +class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. 
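+  // (For now the source select is hard-wired to channel X below, so every
+  // VTX_READ pattern in this file takes its address from the .X channel of
+  // $src_gpr.)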
+ let SRC_SEL_X = 0; + let SRC_SEL_Y = 0; + let STRUCTURED_READ = 0; + let LDS_REQ = 0; + let COALESCED_READ = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 + // %T2_X<def> = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// +def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End isCayman + diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td new file mode 100644 index 0000000..2245f14 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -0,0 +1,681 @@ +//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to Evergreen and newer VLIW4/VLIW5 GPUs +// - Available only on Evergreen family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isEG : Predicate< + "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " + "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "!Subtarget->hasCaymanISA()" +>; + +def isEGorCayman : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" + "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" +>; + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman store instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins, + string name, list<dag> pattern> + : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, + "MEM_RAT_CACHELESS "#name, pattern>; + +class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, + list<dag> pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, + "MEM_RAT "#name, pattern>; + +class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> + : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), + "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" + #!if(has_eop, ", $eop", ""), + [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, + R600_Reg128:$index_gpr, + (i32 imm:$rat_id))]>; + +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + "MSKOR $rw_gpr.XW, $index_gpr", + [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] +> { + let eop = 0; +} + +} // End let Predicates = [isEGorCayman] + +//===----------------------------------------------------------------------===// +// Evergreen Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEG] in { + +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; +defm DIV_eg : DIV_Common<RECIP_IEEE_eg>; + +def MULLO_INT_eg : MULLO_INT_Common<0x8F>; +def MULHI_INT_eg : MULHI_INT_Common<0x90>; +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +def : RsqPat<RECIPSQRT_IEEE_eg, f32>; +def SIN_eg : SIN_Common<0x8D>; +def COS_eg : COS_Common<0x8E>; + +def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; +def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; + +defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>; + +//===----------------------------------------------------------------------===// +// Memory read/write instructions +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1 in { + +// 32-bit store +def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr, $index_gpr, $eop", + [(global_store i32:$rw_gpr, i32:$index_gpr)] +>; + +// 64-bit store +def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, + (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", + [(global_store v2i32:$rw_gpr, i32:$index_gpr)] +>; + +//128-bit store +def 
RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", + [(global_store v4i32:$rw_gpr, i32:$index_gpr)] +>; + +def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; + +} // End usesCustomInserter = 1 + +class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. + let SRC_SEL_X = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 1; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let MEGA_FETCH_COUNT = 2; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 4; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 + // %T2_X<def> = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 8; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// + +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End Predicates = [isEG] + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +// Should be predicated on FeatureFP64 +// def FMA_64 : R600_3OP < +// 0xA, "FMA_64", +// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +// >; + +// BFE_UINT - bit_extract, an optimization for mask and shift +// Src0 = Input +// Src1 = Offset +// Src2 = Width +// +// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) +// +// Example Usage: +// (Offset, Width) +// +// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 +// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 +// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 +// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 +def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", + [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", + [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>; + +def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", + [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + +defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>; + +def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], + VecALU +>; + +def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, 
i32:$src1, i32:$src2))], VecALU +>; + +def : UMad24Pat<MULADD_UINT24_eg>; + +def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; +def : ROTRPattern <BIT_ALIGN_INT_eg>; +def MULADD_eg : MULADD_Common<0x14>; +def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; +def FMA_eg : FMA_Common<0x7>; +def ASHR_eg : ASHR_Common<0x15>; +def LSHR_eg : LSHR_Common<0x16>; +def LSHL_eg : LSHL_Common<0x17>; +def CNDE_eg : CNDE_Common<0x19>; +def CNDGT_eg : CNDGT_Common<0x1A>; +def CNDGE_eg : CNDGE_Common<0x1B>; +def MUL_LIT_eg : MUL_LIT_Common<0x1F>; +def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; +def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU +>; +def DOT4_eg : DOT4_Common<0xBE>; +defm CUBE_eg : CUBE_Common<0xC0>; + +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + +def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; +def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; + +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; + +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; +} + +def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; + +def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { + let Pattern = []; + let Itinerary = AnyALU; +} + +def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + +def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { + let Pattern = []; +} + +def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; + +def GROUP_BARRIER : InstR600 < + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + R600ALU_Word0, + R600ALU_Word1_OP2 <0x54> { + + let dst = 0; + let dst_rel = 0; + let src0 = 0; + let src0_rel = 0; + let src0_neg = 0; + let src0_abs = 0; + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let write = 0; + let omod = 0; + let clamp = 0; + let last = 1; + let bank_swizzle = 0; + let pred_sel = 0; + let update_exec_mask = 0; + let update_pred = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; +} + +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// LDS Instructions +//===----------------------------------------------------------------------===// +class R600_LDS <bits<6> op, dag outs, dag ins, string asm, + list<dag> pattern = []> : + + InstR600 <outs, ins, asm, pattern, XALU>, + R600_ALU_LDS_Word0, + R600LDS_Word1 { + + bits<6> offset = 0; + let lds_op = op; + + let Word1{27} = offset{0}; + let Word1{12} = offset{1}; + let Word1{28} = offset{2}; + let Word1{31} = offset{3}; + let Word0{12} = offset{4}; + let Word0{25} = offset{5}; + + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; + let HasNativeOperands = 1; + let UseNamedOperandTable = 1; +} + +class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS < + lds_op, + (outs R600_Reg32:$dst), + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last OQAP, $src0$src0_rel $pred_sel", + pattern + > { + + let src1 = 0; + let src1_rel = 0; + let src2 = 0; + let src2_rel = 0; + + let usesCustomInserter = 1; + let LDS_1A = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + 
string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", + pattern + > { + + field string BaseOp; + + let src2 = 0; + let src2_rel = 0; + let LDS_1A1D = 1; +} + +class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A1D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), + " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", + pattern> { + + field string BaseOp; + + let LDS_1A1D = 0; + let LDS_1A2D = 1; +} + +class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; +def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; +def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; +def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; +def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; +def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; +def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; +def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; +def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; +def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; +def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; +def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", + [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] +>; +def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", + [(truncstorei8_local i32:$src1, i32:$src0)] +>; +def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", + [(truncstorei16_local i32:$src1, i32:$src0)] +>; +def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", + [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] +>; +def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", + [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] +>; +def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", + [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] +>; +def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", + [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] +>; +def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", + [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", + [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", + 
[(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", + [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", + [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] +>; +def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", + [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] +>; +def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", + [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] +>; +def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", + [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] +>; +def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", + [(set i32:$dst, (sextloadi8_local i32:$src0))] +>; +def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", + [(set i32:$dst, (az_extloadi8_local i32:$src0))] +>; +def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", + [(set i32:$dst, (sextloadi16_local i32:$src0))] +>; +def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", + [(set i32:$dst, (az_extloadi16_local i32:$src0))] +>; + +// TRUNC is used for the FLT_TO_INT instructions to work around a +// perceived problem where the rounding modes are applied differently +// depending on the instruction and the slot they are in. +// See: +// https://bugs.freedesktop.org/show_bug.cgi?id=50232 +// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c +// +// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, +// which do not need to be truncated since the fp values are 0.0f or 1.0f. +// We should look into handling these cases separately. +def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; + +def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; + +// SHA-256 Patterns +def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; + +def EG_ExportSwz : ExportSwzInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : ExportPattern<EG_ExportSwz, 83>; + +def EG_ExportBuf : ExportBufInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>; + +def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "PUSH @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins 
i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; +} +def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; +} + +} // End Predicates = [isEGorCayman] diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 index 0000000..a187de8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -0,0 +1,648 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + OS.flush(); + printInstruction(MI, OS); + + printAnnotation(OS, Annot); +} + +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + +void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " offen"; +} + +void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " idxen"; +} + +void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " addr64"; +} + +void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void 
AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " gds"; +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " glc"; +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " slc"; +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " tfe"; +} + +void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, + const MCRegisterInfo &MRI) { + switch (reg) { + case AMDGPU::VCC: + O << "vcc"; + return; + case AMDGPU::SCC: + O << "scc"; + return; + case AMDGPU::EXEC: + O << "exec"; + return; + case AMDGPU::M0: + O << "m0"; + return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; + return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; + default: + break; + } + + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } + + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
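+  // For example, an encoding value of 0x105 masks down to index 5: a single
+  // 32-bit register prints as "v5", while a four-dword tuple starting there
+  // prints as "v[5:8]".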
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; + return; + } + + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} + +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else + O << "_e32 "; + + printOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == FloatToBits(0.0f)) + O << "0.0"; + else if (Imm == FloatToBits(1.0f)) + O << "1.0"; + else if (Imm == FloatToBits(-1.0f)) + O << "-1.0"; + else if (Imm == FloatToBits(0.5f)) + O << "0.5"; + else if (Imm == FloatToBits(-0.5f)) + O << "-0.5"; + else if (Imm == FloatToBits(2.0f)) + O << "2.0"; + else if (Imm == FloatToBits(-2.0f)) + O << "-2.0"; + else if (Imm == FloatToBits(4.0f)) + O << "4.0"; + else if (Imm == FloatToBits(-4.0f)) + O << "-4.0"; + else + O << formatHex(static_cast<uint64_t>(Imm)); +} + +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { + int64_t SImm = static_cast<int64_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == DoubleToBits(0.0)) + O << "0.0"; + else if (Imm == DoubleToBits(1.0)) + O << "1.0"; + else if (Imm == DoubleToBits(-1.0)) + O << "-1.0"; + else if (Imm == DoubleToBits(0.5)) + O << "0.5"; + else if (Imm == DoubleToBits(-0.5)) + O << "-0.5"; + else if (Imm == DoubleToBits(2.0)) + O << "2.0"; + else if (Imm == DoubleToBits(-2.0)) + O << "-2.0"; + else if (Imm == DoubleToBits(4.0)) + O << "4.0"; + else if (Imm == DoubleToBits(-4.0)) + O << "-4.0"; + else { + assert(isUInt<32>(Imm)); + + // In rare situations, we will have a 32-bit literal in a 64-bit + // operand. This is technically allowed for the encoding of s_mov_b64. + O << formatHex(static_cast<uint64_t>(Imm)); + } +} + +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. + case AMDGPU::PRED_SEL_OFF: + break; + + default: + printRegOperand(Op.getReg(), O, MRI); + break; + } + } else if (Op.isImm()) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int RCID = Desc.OpInfo[OpNo].RegClass; + if (RCID != -1) { + const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); + if (ImmRC.getSize() == 4) + printImmediate32(Op.getImm(), O); + else if (ImmRC.getSize() == 8) + printImmediate64(Op.getImm(), O); + else + llvm_unreachable("Invalid register class size"); + } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { + printImmediate32(Op.getImm(), O); + } else { + // We hit this for the immediate instruction bits that don't yet have a + // custom printer. + // TODO: Eventually this should be unnecessary. + O << formatDec(Op.getImm()); + } + } else if (Op.isFPImm()) { + // We special case 0.0 because otherwise it will be printed as an integer. 
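+    // (0.0 bitcasts to 0, which printImmediate32/64 below treat as the inline
+    // integer constant 0 and would print as "0" rather than "0.0".)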
+ if (Op.getFPImm() == 0.0) + O << "0.0"; + else { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); + + if (ImmRC.getSize() == 4) + printImmediate32(FloatToBits(Op.getFPImm()), O); + else if (ImmRC.getSize() == 8) + printImmediate64(DoubleToBits(Op.getFPImm()), O); + else + llvm_unreachable("Invalid register class size"); + } + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O, &MAI); + } else { + llvm_unreachable("unknown operand type in printOperand"); + } +} + +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & SISrcMods::NEG) + O << '-'; + if (InputModifiers & SISrcMods::ABS) + O << '|'; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & SISrcMods::ABS) + O << '|'; +} + +void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + + if (Imm == 2) { + O << "P0"; + } else if (Imm == 1) { + O << "P20"; + } else if (Imm == 0) { + O << "P10"; + } else { + llvm_unreachable("Invalid interpolation parameter slot"); + } +} + +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef Asm, + StringRef Default) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) { + O << Asm; + } else { + O << Default; + } +} + +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "|"); +} + +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "_SAT"); +} + +void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " clamp"; +} + +void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Imm = MI->getOperand(OpNo).getImm(); + if (Imm == SIOutMods::MUL2) + O << " mul:2"; + else if (Imm == SIOutMods::MUL4) + O << " mul:4"; + else if (Imm == SIOutMods::DIV2) + O << " div:2"; +} + +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; +} + +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "*", " "); +} + +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "-"); +} + +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + break; + case 3: + O << " / 2.0"; + break; + } +} + +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "+"); +} + +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "Pred,"); +} + +void AMDGPUInstPrinter::printWrite(const MCInst 
*MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} + +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << '[' << sel << ']'; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << '.' << chans[chan]; +} + +void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int BankSwizzle = MI->getOperand(OpNo).getImm(); + switch (BankSwizzle) { + case 1: + O << "BS:VEC_021/SCL_122"; + break; + case 2: + O << "BS:VEC_120/SCL_212"; + break; + case 3: + O << "BS:VEC_102/SCL_221"; + break; + case 4: + O << "BS:VEC_201"; + break; + case 5: + O << "BS:VEC_210"; + break; + default: + break; + } + return; +} + +void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Sel = MI->getOperand(OpNo).getImm(); + switch (Sel) { + case 0: + O << 'X'; + break; + case 1: + O << 'Y'; + break; + case 2: + O << 'Z'; + break; + case 3: + O << 'W'; + break; + case 4: + O << '0'; + break; + case 5: + O << '1'; + break; + case 7: + O << '_'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CT = MI->getOperand(OpNo).getImm(); + switch (CT) { + case 0: + O << 'U'; + break; + case 1: + O << 'N'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int KCacheMode = MI->getOperand(OpNo).getImm(); + if (KCacheMode > 0) { + int KCacheBank = MI->getOperand(OpNo - 2).getImm(); + O << "CB" << KCacheBank << ':'; + int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); + int LineSize = (KCacheMode == 1) ? 16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; + } +} + +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Msg = SImm16 & 0xF; + if (Msg == 2 || Msg == 3) { + unsigned Op = (SImm16 >> 4) & 0xF; + if (Msg == 3) + O << "Gs_done("; + else + O << "Gs("; + if (Op == 0) { + O << "nop"; + } else { + unsigned Stream = (SImm16 >> 8) & 0x3; + if (Op == 1) + O << "cut"; + else if (Op == 2) + O << "emit"; + else if (Op == 3) + O << "emit-cut"; + O << " stream " << Stream; + } + O << "), [m0] "; + } else if (Msg == 1) + O << "interrupt "; + else if (Msg == 15) + O << "system "; + else + O << "unknown(" << Msg << ") "; +} + +void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs + // SIInsertWaits.cpp bits usage does not match ISA docs description but it + // works so it might be a misprint in docs. 
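+  // Worked example: an operand value of 0x0070 decodes below as Vmcnt = 0,
+  // Expcnt = 7 and Lgkmcnt = 0; only the non-default counters are printed,
+  // giving "vmcnt(0) lgkmcnt(0)".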
+ unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Vmcnt = SImm16 & 0xF; + unsigned Expcnt = (SImm16 >> 4) & 0xF; + unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; + + bool NeedSpace = false; + + if (Vmcnt != 0xF) { + O << "vmcnt(" << Vmcnt << ')'; + NeedSpace = true; + } + + if (Expcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "expcnt(" << Expcnt << ')'; + NeedSpace = true; + } + + if (Lgkmcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "lgkmcnt(" << Lgkmcnt << ')'; + } +} + +#include "AMDGPUGenAsmWriter.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 index 0000000..90541d8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -0,0 +1,86 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class AMDGPUInstPrinter : public MCInstPrinter { +public: + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + //Autogenerated by tblgen + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + static void printRegOperand(unsigned RegNo, raw_ostream &O, + const MCRegisterInfo &MRI); + +private: + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printImmediate32(uint32_t I, raw_ostream &O); + void printImmediate64(uint64_t I, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printInterpSlot(const MCInst *MI, unsigned OpNum, 
raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, + StringRef Asm, StringRef Default = ""); + static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O); + static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp new file mode 100644 index 0000000..60e8c8f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -0,0 +1,189 @@ +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +namespace { + +class AMDGPUMCObjectWriter : public MCObjectWriter { +public: + AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} + void executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { + //XXX: Implement if necessary. 
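+    // (Nothing to do for now: writeObject below simply streams out the
+    // section data and recordRelocation is unimplemented, so there are no
+    // symbols or relocations to bind at this stage.)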
+ } + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, bool &IsPCRel, + uint64_t &FixedValue) override { + assert(!"Not implemented"); + } + + void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + +}; + +class AMDGPUAsmBackend : public MCAsmBackend { +public: + AMDGPUAsmBackend(const Target &T) + : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + assert(!"Not implemented"); + } + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; +}; + +} //End anonymous namespace + +void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { + Asm.writeSectionData(&*I, Layout); + } +} + +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + case FK_Data_1: + return 1; + case FK_Data_2: + return 2; + case FK_Data_4: + return 4; + case FK_Data_8: + return 8; + default: + llvm_unreachable("Unknown fixup kind!"); + } +} + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + + switch ((unsigned)Fixup.getKind()) { + case AMDGPU::fixup_si_sopp_br: { + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + *Dst = (Value - 4) / 4; + break; + } + + case AMDGPU::fixup_si_rodata: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. + *Dst = Value + 4; + break; + } + default: { + // FIXME: Copied from AArch64 + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + if (!Value) + return; // Doesn't change encoding. + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. 
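+      // e.g. a 4-byte fixup with Value 0x12345678 ORs 0x78, 0x56, 0x34, 0x12
+      // into Data[Offset] .. Data[Offset+3] (little-endian byte order).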
+ for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } +} + +const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( + MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + return Infos[Kind - FirstTargetFixupKind]; +} + +bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + OW->WriteZeros(Count); + + return true; +} + +//===----------------------------------------------------------------------===// +// ELFAMDGPUAsmBackend class +//===----------------------------------------------------------------------===// + +namespace { + +class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { + bool Is64Bit; + +public: + ELFAMDGPUAsmBackend(const Target &T, bool Is64Bit) : + AMDGPUAsmBackend(T), Is64Bit(Is64Bit) { } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAMDGPUELFObjectWriter(Is64Bit, OS); + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + Triple TargetTriple(TT); + + // Use 64-bit ELF for amdgcn + return new ELFAMDGPUAsmBackend(T, TargetTriple.getArch() == Triple::amdgcn); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp new file mode 100644 index 0000000..820f17d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -0,0 +1,40 @@ +//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" + +using namespace llvm; + +namespace { + +class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { +public: + AMDGPUELFObjectWriter(bool Is64Bit); +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override { + return Fixup.getKind(); + } + +}; + + +} // End anonymous namespace + +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit) + : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA, + ELF::EM_AMDGPU, false) { } + +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, raw_pwrite_stream &OS) { + MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(Is64Bit); + return createELFObjectWriter(MOTW, OS, true); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp new file mode 100644 index 0000000..9ff9fe7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -0,0 +1,26 @@ +//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUELFStreamer.h" +#include "Utils/AMDGPUBaseInfo.h" + +using namespace llvm; + +void AMDGPUELFStreamer::InitSections(bool NoExecStack) { + // Start with the .hsatext section by default. + SwitchSection(AMDGPU::getHSATextSection(getContext())); +} + +MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, + MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return new AMDGPUELFStreamer(Context, MAB, OS, Emitter); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h new file mode 100644 index 0000000..488d7e7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -0,0 +1,40 @@ +//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a custom MCELFStreamer which allows us to insert some hooks before +// emitting data into an actual object file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCSubtargetInfo; + +class AMDGPUELFStreamer : public MCELFStreamer { +public: + AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, MAB, OS, Emitter) { } + + virtual void InitSections(bool NoExecStac) override; +}; + +MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll); +} // namespace llvm. + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h new file mode 100644 index 0000000..59a9178 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -0,0 +1,31 @@ +//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AMDGPU { +enum Fixups { + /// 16-bit PC relative fixup for SOPP branch instructions. 
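+  /// (Worked example, added for illustration: the resolved fixup value is the
+  /// byte distance from the start of the 4-byte s_branch instruction to the
+  /// branch target, and AMDGPUAsmBackend::applyFixup encodes it as
+  /// (Value - 4) / 4 dwords, so a target 16 bytes ahead becomes simm16 = 3.)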
+ fixup_si_sopp_br = FirstTargetFixupKind, + + /// fixup for global addresses with constant initializers + fixup_si_rodata, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp new file mode 100644 index 0000000..4bc80a0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -0,0 +1,42 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCAsmInfo.h" + +using namespace llvm; +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { + HasSingleParameterDotFile = false; + //===------------------------------------------------------------------===// + MaxInstLength = 16; + SeparatorString = "\n"; + CommentString = ";"; + PrivateLabelPrefix = ""; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + + //===--- Data Emission Directives -------------------------------------===// + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = true; + + //===--- Global Variable Emission Directives --------------------------===// + HasAggressiveSymbolFolding = true; + COMMDirectiveAlignmentIsInBytes = false; + HasNoDeadStrip = true; + WeakRefDirective = ".weakref\t"; + //===--- Dwarf Emission Directives -----------------------------------===// + SupportsDebugInformation = true; +} + +bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || + MCAsmInfo::shouldOmitSectionDirective(SectionName); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h new file mode 100644 index 0000000..a546961 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -0,0 +1,33 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H + +#include "llvm/MC/MCAsmInfoELF.h" +namespace llvm { + +class Triple; + +// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, +// you will need to make sure your new class sets PrivateGlobalPrefix to +// a prefix that won't appear in a function name. The default value +// for PrivateGlobalPrefix is 'L', so it will consider any function starting +// with 'L' as a local symbol. 
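+// (Illustrative sketch, not part of the original file: a hypothetical
+// subclass would typically set something like
+//   PrivateGlobalPrefix = ".L";
+// in its constructor so that only compiler-generated ".L*" symbols are
+// treated as private/local labels.)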
+class AMDGPUMCAsmInfo : public MCAsmInfoELF { +public: + explicit AMDGPUMCAsmInfo(const Triple &TT); + bool shouldOmitSectionDirective(StringRef SectionName) const override; +}; +} // namespace llvm +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp new file mode 100644 index 0000000..521b3b3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -0,0 +1,21 @@ +//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCCodeEmitter.h" + +using namespace llvm; + +// pin vtable to this file +void AMDGPUMCCodeEmitter::anchor() {} + diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 index 0000000..c957427 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -0,0 +1,50 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H + +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCInst; +class MCOperand; +class MCSubtargetInfo; + +class AMDGPUMCCodeEmitter : public MCCodeEmitter { + virtual void anchor(); +public: + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp new file mode 100644 index 0000000..f704094 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -0,0 +1,122 @@ +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides AMDGPU specific target descriptions. 
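+/// (In particular, LLVMInitializeAMDGPUTargetMC below registers the MC-level
+/// asm info, instruction/register/subtarget descriptions, instruction
+/// printer, asm backend and ELF streamer for both the R600 and GCN targets,
+/// plus the R600 and SI code emitters and the GCN target streamers.)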
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "AMDGPUELFStreamer.h" +#include "AMDGPUMCAsmInfo.h" +#include "AMDGPUTargetStreamer.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDGPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDGPUGenRegisterInfo.inc" + +static MCInstrInfo *createAMDGPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDGPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo * +createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + return new AMDGPUInstPrinter(MAI, MII, MRI); +} + +static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new AMDGPUTargetAsmStreamer(S, OS); +} + +static MCTargetStreamer * createAMDGPUObjectTargetStreamer( + MCStreamer &S, + const MCSubtargetInfo &STI) { + return new AMDGPUTargetELFStreamer(S); +} + +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + if (T.getOS() == Triple::AMDHSA) + return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + + return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); +} + +extern "C" void LLVMInitializeAMDGPUTargetMC() { + for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); + + TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); + } + + // R600 specific registration + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + createR600MCCodeEmitter); + + // GCN specific registration + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); + + TargetRegistry::RegisterAsmTargetStreamer(TheGCNTarget, + createAMDGPUAsmTargetStreamer); + TargetRegistry::RegisterObjectTargetStreamer(TheGCNTarget, + createAMDGPUObjectTargetStreamer); +} diff --git 
a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
new file mode 100644
index 0000000..5d1b86b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -0,0 +1,62 @@
+//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+class raw_ostream;
+
+extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
+
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                       const MCRegisterInfo &MRI,
+                                       MCContext &Ctx);
+
+MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                     const MCRegisterInfo &MRI,
+                                     MCContext &Ctx);
+
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     const Triple &TT, StringRef CPU);
+
+MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
+                                            raw_pwrite_stream &OS);
+} // End llvm namespace
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
new file mode 100644
index 0000000..b91134d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -0,0 +1,346 @@
+//===-- AMDGPUTargetStreamer.cpp - AMDGPU Target Streamer Methods ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AMDGPU specific target streamer methods.
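+//
+// Two implementations follow: AMDGPUTargetAsmStreamer prints the .hsa_*
+// directives and the amd_kernel_code_t block as assembly text, while
+// AMDGPUTargetELFStreamer emits the equivalent ELF notes, sections and
+// symbol annotations directly.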
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetStreamer.h" +#include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/FormattedStream.h" + +using namespace llvm; + +AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S) { } + +//===----------------------------------------------------------------------===// +// AMDGPUTargetAsmStreamer +//===----------------------------------------------------------------------===// + +AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS) + : AMDGPUTargetStreamer(S), OS(OS) { } + +void +AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, + uint32_t Minor) { + OS << "\t.hsa_code_object_version " << + Twine(Major) << "," << Twine(Minor) << '\n'; +} + +void +AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, + uint32_t Minor, + uint32_t Stepping, + StringRef VendorName, + StringRef ArchName) { + OS << "\t.hsa_code_object_isa " << + Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) << + ",\"" << VendorName << "\",\"" << ArchName << "\"\n"; + +} + +void +AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { + uint64_t ComputePgmRsrc2 = (Header.compute_pgm_resource_registers >> 32); + bool EnableSGPRPrivateSegmentBuffer = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + bool EnableSGPRDispatchPtr = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); + bool EnableSGPRQueuePtr = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); + bool EnableSGPRKernargSegmentPtr = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); + bool EnableSGPRDispatchID = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + bool EnableSGPRFlatScratchInit = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + bool EnableSGPRPrivateSegmentSize = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + bool EnableSGPRGridWorkgroupCountX = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X); + bool EnableSGPRGridWorkgroupCountY = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y); + bool EnableSGPRGridWorkgroupCountZ = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z); + bool EnableOrderedAppendGDS = (Header.code_properties & + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS); + uint32_t PrivateElementSize = (Header.code_properties & + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >> + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT; + bool IsPtr64 = (Header.code_properties & AMD_CODE_PROPERTY_IS_PTR64); + bool IsDynamicCallstack = (Header.code_properties & + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK); + bool IsDebugEnabled = (Header.code_properties & + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED); + bool IsXNackEnabled = (Header.code_properties & + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED); + + OS << "\t.amd_kernel_code_t\n" << + "\t\tkernel_code_version_major = " << + Header.amd_kernel_code_version_major << '\n' << + "\t\tkernel_code_version_minor = " << + Header.amd_kernel_code_version_minor << '\n' << + "\t\tmachine_kind = " 
<< + Header.amd_machine_kind << '\n' << + "\t\tmachine_version_major = " << + Header.amd_machine_version_major << '\n' << + "\t\tmachine_version_minor = " << + Header.amd_machine_version_minor << '\n' << + "\t\tmachine_version_stepping = " << + Header.amd_machine_version_stepping << '\n' << + "\t\tkernel_code_entry_byte_offset = " << + Header.kernel_code_entry_byte_offset << '\n' << + "\t\tkernel_code_prefetch_byte_size = " << + Header.kernel_code_prefetch_byte_size << '\n' << + "\t\tmax_scratch_backing_memory_byte_size = " << + Header.max_scratch_backing_memory_byte_size << '\n' << + "\t\tcompute_pgm_rsrc1_vgprs = " << + G_00B848_VGPRS(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_sgprs = " << + G_00B848_SGPRS(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_priority = " << + G_00B848_PRIORITY(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_float_mode = " << + G_00B848_FLOAT_MODE(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_priv = " << + G_00B848_PRIV(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_dx10_clamp = " << + G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_debug_mode = " << + G_00B848_DEBUG_MODE(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc1_ieee_mode = " << + G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) << '\n' << + "\t\tcompute_pgm_rsrc2_scratch_en = " << + G_00B84C_SCRATCH_EN(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_user_sgpr = " << + G_00B84C_USER_SGPR(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_tgid_x_en = " << + G_00B84C_TGID_X_EN(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_tgid_y_en = " << + G_00B84C_TGID_Y_EN(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_tgid_z_en = " << + G_00B84C_TGID_Z_EN(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_tg_size_en = " << + G_00B84C_TG_SIZE_EN(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = " << + G_00B84C_TIDIG_COMP_CNT(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_excp_en_msb = " << + G_00B84C_EXCP_EN_MSB(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_lds_size = " << + G_00B84C_LDS_SIZE(ComputePgmRsrc2) << '\n' << + "\t\tcompute_pgm_rsrc2_excp_en = " << + G_00B84C_EXCP_EN(ComputePgmRsrc2) << '\n' << + + "\t\tenable_sgpr_private_segment_buffer = " << + EnableSGPRPrivateSegmentBuffer << '\n' << + "\t\tenable_sgpr_dispatch_ptr = " << + EnableSGPRDispatchPtr << '\n' << + "\t\tenable_sgpr_queue_ptr = " << + EnableSGPRQueuePtr << '\n' << + "\t\tenable_sgpr_kernarg_segment_ptr = " << + EnableSGPRKernargSegmentPtr << '\n' << + "\t\tenable_sgpr_dispatch_id = " << + EnableSGPRDispatchID << '\n' << + "\t\tenable_sgpr_flat_scratch_init = " << + EnableSGPRFlatScratchInit << '\n' << + "\t\tenable_sgpr_private_segment_size = " << + EnableSGPRPrivateSegmentSize << '\n' << + "\t\tenable_sgpr_grid_workgroup_count_x = " << + EnableSGPRGridWorkgroupCountX << '\n' << + "\t\tenable_sgpr_grid_workgroup_count_y = " << + EnableSGPRGridWorkgroupCountY << '\n' << + "\t\tenable_sgpr_grid_workgroup_count_z = " << + EnableSGPRGridWorkgroupCountZ << '\n' << + "\t\tenable_ordered_append_gds = " << + EnableOrderedAppendGDS << '\n' << + "\t\tprivate_element_size = " << + PrivateElementSize << '\n' << + "\t\tis_ptr64 = " << + IsPtr64 << '\n' << + "\t\tis_dynamic_callstack = " << + IsDynamicCallstack << '\n' << + "\t\tis_debug_enabled = " << + IsDebugEnabled << '\n' << + 
"\t\tis_xnack_enabled = " << + IsXNackEnabled << '\n' << + "\t\tworkitem_private_segment_byte_size = " << + Header.workitem_private_segment_byte_size << '\n' << + "\t\tworkgroup_group_segment_byte_size = " << + Header.workgroup_group_segment_byte_size << '\n' << + "\t\tgds_segment_byte_size = " << + Header.gds_segment_byte_size << '\n' << + "\t\tkernarg_segment_byte_size = " << + Header.kernarg_segment_byte_size << '\n' << + "\t\tworkgroup_fbarrier_count = " << + Header.workgroup_fbarrier_count << '\n' << + "\t\twavefront_sgpr_count = " << + Header.wavefront_sgpr_count << '\n' << + "\t\tworkitem_vgpr_count = " << + Header.workitem_vgpr_count << '\n' << + "\t\treserved_vgpr_first = " << + Header.reserved_vgpr_first << '\n' << + "\t\treserved_vgpr_count = " << + Header.reserved_vgpr_count << '\n' << + "\t\treserved_sgpr_first = " << + Header.reserved_sgpr_first << '\n' << + "\t\treserved_sgpr_count = " << + Header.reserved_sgpr_count << '\n' << + "\t\tdebug_wavefront_private_segment_offset_sgpr = " << + Header.debug_wavefront_private_segment_offset_sgpr << '\n' << + "\t\tdebug_private_segment_buffer_sgpr = " << + Header.debug_private_segment_buffer_sgpr << '\n' << + "\t\tkernarg_segment_alignment = " << + (uint32_t)Header.kernarg_segment_alignment << '\n' << + "\t\tgroup_segment_alignment = " << + (uint32_t)Header.group_segment_alignment << '\n' << + "\t\tprivate_segment_alignment = " << + (uint32_t)Header.private_segment_alignment << '\n' << + "\t\twavefront_size = " << + (uint32_t)Header.wavefront_size << '\n' << + "\t\tcall_convention = " << + Header.call_convention << '\n' << + "\t\truntime_loader_kernel_symbol = " << + Header.runtime_loader_kernel_symbol << '\n' << + // TODO: control_directives + "\t.end_amd_kernel_code_t\n"; + +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + switch (Type) { + default: llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; + break; + } +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + +//===----------------------------------------------------------------------===// +// AMDGPUTargetELFStreamer +//===----------------------------------------------------------------------===// + +AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S) + : AMDGPUTargetStreamer(S), Streamer(S) { } + +MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { + return static_cast<MCELFStreamer &>(Streamer); +} + +void +AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, + uint32_t Minor) { + MCStreamer &OS = getStreamer(); + MCSectionELF *Note = OS.getContext().getELFSection(".note", ELF::SHT_NOTE, 0); + + unsigned NameSZ = 4; + + OS.PushSection(); + OS.SwitchSection(Note); + OS.EmitIntValue(NameSZ, 4); // namesz + OS.EmitIntValue(8, 4); // descz + OS.EmitIntValue(NT_AMDGPU_HSA_CODE_OBJECT_VERSION, 4); // type + OS.EmitBytes(StringRef("AMD", NameSZ)); // name + OS.EmitIntValue(Major, 4); // desc + OS.EmitIntValue(Minor, 4); + OS.EmitValueToAlignment(4); + OS.PopSection(); +} + +void +AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, + uint32_t Minor, + uint32_t Stepping, + StringRef VendorName, + StringRef ArchName) { + MCStreamer 
&OS = getStreamer(); + MCSectionELF *Note = OS.getContext().getELFSection(".note", ELF::SHT_NOTE, 0); + + unsigned NameSZ = 4; + uint16_t VendorNameSize = VendorName.size() + 1; + uint16_t ArchNameSize = ArchName.size() + 1; + unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) + + sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + + VendorNameSize + ArchNameSize; + + OS.PushSection(); + OS.SwitchSection(Note); + OS.EmitIntValue(NameSZ, 4); // namesz + OS.EmitIntValue(DescSZ, 4); // descsz + OS.EmitIntValue(NT_AMDGPU_HSA_ISA, 4); // type + OS.EmitBytes(StringRef("AMD", 4)); // name + OS.EmitIntValue(VendorNameSize, 2); // desc + OS.EmitIntValue(ArchNameSize, 2); + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + OS.EmitIntValue(Stepping, 4); + OS.EmitBytes(VendorName); + OS.EmitIntValue(0, 1); // NULL terminate VendorName + OS.EmitBytes(ArchName); + OS.EmitIntValue(0, 1); // NULL terminte ArchName + OS.EmitValueToAlignment(4); + OS.PopSection(); +} + +void +AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { + + MCStreamer &OS = getStreamer(); + OS.PushSection(); + // The MCObjectFileInfo that is available to the assembler is a generic + // implementation and not AMDGPUHSATargetObjectFile, so we can't use + // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. + OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); + OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); + OS.PopSection(); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(SymbolName)); + Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h new file mode 100644 index 0000000..83bb728 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -0,0 +1,98 @@ +//===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H + +#include "AMDKernelCodeT.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +namespace llvm { + +class MCELFStreamer; + +class AMDGPUTargetStreamer : public MCTargetStreamer { +public: + AMDGPUTargetStreamer(MCStreamer &S); + virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, + uint32_t Minor) = 0; + + virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, + uint32_t Stepping, + StringRef VendorName, + StringRef ArchName) = 0; + + virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; + + virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; +}; + +class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { + formatted_raw_ostream &OS; +public: + AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + void EmitDirectiveHSACodeObjectVersion(uint32_t Major, + uint32_t Minor) override; + + void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, + uint32_t Stepping, StringRef VendorName, + StringRef ArchName) override; + + void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; +}; + +class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { + + enum NoteType { + NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, + NT_AMDGPU_HSA_HSAIL = 2, + NT_AMDGPU_HSA_ISA = 3, + NT_AMDGPU_HSA_PRODUCER = 4, + NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, + NT_AMDGPU_HSA_EXTENSION = 6, + NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, + NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 + }; + + MCStreamer &Streamer; + +public: + AMDGPUTargetELFStreamer(MCStreamer &S); + + MCELFStreamer &getStreamer(); + + void EmitDirectiveHSACodeObjectVersion(uint32_t Major, + uint32_t Minor) override; + + void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, + uint32_t Stepping, StringRef VendorName, + StringRef ArchName) override; + + void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; +}; + +} +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 index 0000000..3c1142d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -0,0 +1,179 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// \brief The R600 code emitter produces machine code that can be executed +/// directly on the GPU device. 
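+/// (For example, encodeInstruction below emits VTX and TEX clauses as a
+/// 64-bit instruction word pair followed by a 32-bit word and 32 bits of
+/// padding, while other instructions are emitted as a single 64-bit word.)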
+// +//===----------------------------------------------------------------------===// + +#include "R600Defines.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + void operator=(const R600MCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + +public: + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) + : MCII(mcii), MRI(mri) { } + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + +private: + void EmitByte(unsigned int byte, raw_ostream &OS) const; + + void Emit(uint32_t value, raw_ostream &OS) const; + void Emit(uint64_t value, raw_ostream &OS) const; + + unsigned getHWRegChan(unsigned reg) const; + unsigned getHWReg(unsigned regNo) const; +}; + +} // End anonymous namespace + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum FCInstr { + FC_IF_PREDICATE = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK_PREDICATE, + FC_CONTINUE +}; + +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new R600MCCodeEmitter(MCII, MRI); +} + +void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::FETCH_CLAUSE || + MI.getOpcode() == AMDGPU::ALU_CLAUSE || + MI.getOpcode() == AMDGPU::BUNDLE || + MI.getOpcode() == AMDGPU::KILL) { + return; + } else if (IS_VTX(Desc)) { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + InstWord2 |= 1 << 19; // Mega-Fetch bit + } + + Emit(InstWord01, OS); + Emit(InstWord2, OS); + Emit((uint32_t) 0, OS); + } else if (IS_TEX(Desc)) { + int64_t Sampler = MI.getOperand(14).getImm(); + + int64_t SrcSelect[4] = { + MI.getOperand(2).getImm(), + MI.getOperand(3).getImm(), + MI.getOperand(4).getImm(), + MI.getOperand(5).getImm() + }; + int64_t Offsets[3] = { + MI.getOperand(6).getImm() & 0x1F, + MI.getOperand(7).getImm() & 0x1F, + MI.getOperand(8).getImm() & 0x1F + }; + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | + Offsets[2] << 10; + + Emit(Word01, OS); + Emit(Word2, OS); + Emit((uint32_t) 0, OS); + } else { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); + if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + 
((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } + Emit(Inst, OS); + } +} + +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { + OS.write((uint8_t) Byte & 0xff); +} + +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { + support::endian::Writer<support::little>(OS).write(Value); +} + +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { + support::endian::Writer<support::little>(OS).write(Value); +} + +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { + return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +} + +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixup, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) + return MRI.getEncodingValue(MO.getReg()); + return getHWReg(MO.getReg()); + } + + assert(MO.isImm()); + return MO.getImm(); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 index 0000000..9eb3dad --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -0,0 +1,278 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The SI code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + void operator=(const SIMCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + + /// \brief Can this operand also contain immediate values? + bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; + + /// \brief Encode an fp or int literal + uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; + +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + MCContext &ctx) + : MCII(mcii), MRI(mri) { } + + ~SIMCCodeEmitter() override {} + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. 
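+  /// (For a register this is its hardware encoding; for a symbolic operand a
+  /// fixup_si_rodata fixup is also recorded at byte offset 4 of the
+  /// instruction -- see the implementation at the end of this file.)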
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// instructions. + unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; +}; + +} // End anonymous namespace + +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new SIMCCodeEmitter(MCII, MRI, Ctx); +} + +bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, + unsigned OpNo) const { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + + return OpType == AMDGPU::OPERAND_REG_IMM32 || + OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. +template <typename IntTy> +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; + + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); + + return 0; +} + +static uint32_t getLit32Encoding(uint32_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == FloatToBits(0.5f)) + return 240; + + if (Val == FloatToBits(-0.5f)) + return 241; + + if (Val == FloatToBits(1.0f)) + return 242; + + if (Val == FloatToBits(-1.0f)) + return 243; + + if (Val == FloatToBits(2.0f)) + return 244; + + if (Val == FloatToBits(-2.0f)) + return 245; + + if (Val == FloatToBits(4.0f)) + return 246; + + if (Val == FloatToBits(-4.0f)) + return 247; + + return 255; +} + +static uint32_t getLit64Encoding(uint64_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == DoubleToBits(0.5)) + return 240; + + if (Val == DoubleToBits(-0.5)) + return 241; + + if (Val == DoubleToBits(1.0)) + return 242; + + if (Val == DoubleToBits(-1.0)) + return 243; + + if (Val == DoubleToBits(2.0)) + return 244; + + if (Val == DoubleToBits(-2.0)) + return 245; + + if (Val == DoubleToBits(4.0)) + return 246; + + if (Val == DoubleToBits(-4.0)) + return 247; + + return 255; +} + +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, + unsigned OpSize) const { + if (MO.isExpr()) + return 255; + + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + if (OpSize == 4) + return getLit32Encoding(static_cast<uint32_t>(MO.getImm())); + + assert(OpSize == 8); + + return getLit64Encoding(static_cast<uint64_t>(MO.getImm())); +} + +void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = Desc.getSize(); + + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } + + if (bytes > 4) + return; + + // Check for additional literals in SRC0/1/2 (Op 1/2/3) + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!isSrcOperand(Desc, i)) + continue; + + int RCID = Desc.OpInfo[i].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + // Is this operand a literal immediate? 
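+    // (Illustrative: getLitEncoding returns 255 only for a true 32/64-bit
+    // literal. Inline immediates such as 1.0f (encoded as 242) or -3
+    // (encoded as 195) fit in the operand field itself, so no extra literal
+    // dword is emitted for them.)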
+ const MCOperand &Op = MI.getOperand(i); + if (getLitEncoding(Op, RC.getSize()) != 255) + continue; + + // Yes! Encode it + int64_t Imm = 0; + + if (Op.isImm()) + Imm = Op.getImm(); + else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. + llvm_unreachable("Must be immediate or expr"); + + for (unsigned j = 0; j < 4; j++) { + OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); + } + + // Only one literal value allowed + break; + } +} + +unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + return getMachineOpValue(MI, MO, Fixups, STI); +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + + if (MO.isExpr()) { + const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; + Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); + } + + // Figure out the operand number, needed for isSrcOperand check + unsigned OpNo = 0; + for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { + if (&MO == &MI.getOperand(OpNo)) + break; + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (isSrcOperand(Desc, OpNo)) { + int RCID = Desc.OpInfo[OpNo].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + uint32_t Enc = getLitEncoding(MO, RC.getSize()); + if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + return Enc; + + } else if (MO.isImm()) + return MO.getImm(); + + llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td new file mode 100644 index 0000000..a1584a2 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -0,0 +1,148 @@ +//===-- Processors.td - R600 Processor definitions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> +: Processor<Name, itin, Features>; + +//===----------------------------------------------------------------------===// +// R600 +//===----------------------------------------------------------------------===// +def : Proc<"", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache]>; + +def : Proc<"r600", R600_VLIW5_Itin, + [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"r630", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rs880", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize16]>; + +def : Proc<"rv670", R600_VLIW5_Itin, + [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// R700 +//===----------------------------------------------------------------------===// + +def : Proc<"rv710", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv730", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv770", R600_VLIW5_Itin, + [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Evergreen +//===----------------------------------------------------------------------===// + +def : Proc<"cedar", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, + FeatureCFALUBug]>; + +def : Proc<"redwood", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, + FeatureCFALUBug]>; + +def : Proc<"sumo", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; + +def : Proc<"juniper", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"cypress", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureFP64, FeatureVertexCache, + FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Northern Islands +//===----------------------------------------------------------------------===// + +def : Proc<"barts", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"turks", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"caicos", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureCFALUBug]>; + +def : Proc<"cayman", R600_VLIW4_Itin, + [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; + +//===----------------------------------------------------------------------===// +// Southern Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +//===----------------------------------------------------------------------===// +// Sea Islands 
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount32, FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"kabini", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount16]
+>;
+
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount32, FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+  [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32,
+   FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"mullins", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount16]>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"fiji", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"stoney", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
new file mode 100644
index 0000000..3cb9021
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -0,0 +1,206 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The R600EmitClauseMarkers pass emits CFAlu instructions in a conservative
+/// manner. This pass merges consecutive CFAlu clauses where applicable.
+/// It should be run after if-conversion (IfCvt) for best results.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600mergeclause"
+
+namespace {
+
+static bool isCFAlu(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::CF_ALU:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned getCFAluSize(const MachineInstr *MI) const;
+  bool isCFAluEnabled(const MachineInstr *MI) const;
+
+  /// The IfCvt pass can generate "disabled" ALU clause markers that need to
+  /// be removed, with their contents merged into the previous ALU clause.
+  /// This function scans the instructions following \p CFAlu, merging the
+  /// contents of every disabled CFAlu it finds and stopping at the first
+  /// enabled one.
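+  /// (Illustrative example: an enabled CF_ALU covering 5 slots followed by a
+  /// disabled CF_ALU covering 3 slots becomes a single enabled CF_ALU whose
+  /// COUNT operand is 8, and the disabled marker is erased.)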
+ void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; + + /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if + /// it is the case. + bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) + const; + +public: + R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override; +}; + +char R600ClauseMergePass::ID = 0; + +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); +} + +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); +} + +void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) + const { + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + I++; + do { + while (I!= E && !isCFAlu(I)) + I++; + if (I == E) + return; + MachineInstr *MI = I++; + if (isCFAluEnabled(MI)) + break; + CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI->eraseFromParent(); + } while (I != E); +} + +bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, + const MachineInstr *LatrCFAlu) const { + assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + unsigned RootInstCount = getCFAluSize(RootCFAlu), + LaterInstCount = getCFAluSize(LatrCFAlu); + unsigned CumuledInsts = RootInstCount + LaterInstCount; + if (CumuledInsts >= TII->getMaxAlusPerClause()) { + DEBUG(dbgs() << "Excess inst counts\n"); + return false; + } + if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return false; + // Is KCache Bank 0 compatible ? + int Mode0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + int KBank0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + int KBank0LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + if (LatrCFAlu->getOperand(Mode0Idx).getImm() && + RootCFAlu->getOperand(Mode0Idx).getImm() && + (LatrCFAlu->getOperand(KBank0Idx).getImm() != + RootCFAlu->getOperand(KBank0Idx).getImm() || + LatrCFAlu->getOperand(KBank0LineIdx).getImm() != + RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + // Is KCache Bank 1 compatible ? 
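+  // (As with bank 0 above: the clauses may only be merged if any constant
+  // cache bank/line the later clause uses matches what the root clause has
+  // already locked in, since a CF_ALU clause can only encode one KCACHE
+  // bank/line setting per constant bank.)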
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
+      RootCFAlu->getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
+           RootCFAlu->getOperand(KBank1Idx).getImm() ||
+       LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
+           RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC1\n");
+    return false;
+  }
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
+    RootCFAlu->getOperand(Mode0Idx).setImm(
+        LatrCFAlu->getOperand(Mode0Idx).getImm());
+    RootCFAlu->getOperand(KBank0Idx).setImm(
+        LatrCFAlu->getOperand(KBank0Idx).getImm());
+    RootCFAlu->getOperand(KBank0LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
+    RootCFAlu->getOperand(Mode1Idx).setImm(
+        LatrCFAlu->getOperand(Mode1Idx).getImm());
+    RootCFAlu->getOperand(KBank1Idx).setImm(
+        LatrCFAlu->getOperand(KBank1Idx).getImm());
+    RootCFAlu->getOperand(KBank1LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+  return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+       BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      MachineInstr *MI = I++;
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI->getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
+        MI->eraseFromParent();
+      } else {
+        assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+const char *R600ClauseMergePass::getPassName() const {
+  return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+  return new R600ClauseMergePass(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
new file mode 100644
index 0000000..bd80bb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,679 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass turns all control flow pseudo instructions into native ones,
+/// computing their address on the fly; it also sets STACK_SIZE info.
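+///
+/// Addresses are assigned by walking each basic block while keeping a running
+/// count (CfCount) of the control flow entries emitted so far; JUMP, ELSE and
+/// loop instructions are created with a placeholder address and back-patched
+/// through CounterPropagateAddr once the count at their target is known.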
+//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "r600cf" + +namespace { + +struct CFStack { + + enum StackItem { + ENTRY = 0, + SUB_ENTRY = 1, + FIRST_NON_WQM_PUSH = 2, + FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 + }; + + const AMDGPUSubtarget *ST; + std::vector<StackItem> BranchStack; + std::vector<StackItem> LoopStack; + unsigned MaxStackSize; + unsigned CurrentEntries; + unsigned CurrentSubEntries; + + CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), + // We need to reserve a stack entry for CALL_FS in vertex shaders. + MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + CurrentEntries(0), CurrentSubEntries(0) { } + + unsigned getLoopDepth(); + bool branchStackContains(CFStack::StackItem); + bool requiresWorkAroundForInst(unsigned Opcode); + unsigned getSubEntrySize(CFStack::StackItem Item); + void updateMaxStackSize(); + void pushBranch(unsigned Opcode, bool isWQM = false); + void pushLoop(); + void popBranch(); + void popLoop(); +}; + +unsigned CFStack::getLoopDepth() { + return LoopStack.size(); +} + +bool CFStack::branchStackContains(CFStack::StackItem Item) { + for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(), + E = BranchStack.end(); I != E; ++I) { + if (*I == Item) + return true; + } + return false; +} + +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && + getLoopDepth() > 1) + return true; + + if (!ST->hasCFAluBug()) + return false; + + switch(Opcode) { + default: return false; + case AMDGPU::CF_ALU_PUSH_BEFORE: + case AMDGPU::CF_ALU_ELSE_AFTER: + case AMDGPU::CF_ALU_BREAK: + case AMDGPU::CF_ALU_CONTINUE: + if (CurrentSubEntries == 0) + return false; + if (ST->getWavefrontSize() == 64) { + // We are being conservative here. We only require this work-around if + // CurrentSubEntries > 3 && + // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) + // + // We have to be conservative, because we don't know for certain that + // our stack allocation algorithm for Evergreen/NI is correct. Applying this + // work-around when CurrentSubEntries > 3 allows us to over-allocate stack + // resources without any problems. + return CurrentSubEntries > 3; + } else { + assert(ST->getWavefrontSize() == 32); + // We are being conservative here. We only require the work-around if + // CurrentSubEntries > 7 && + // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) + // See the comment on the wavefront size == 64 case for why we are + // being conservative. + return CurrentSubEntries > 7; + } + } +} + +unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { + switch(Item) { + default: + return 0; + case CFStack::FIRST_NON_WQM_PUSH: + assert(!ST->hasCaymanISA()); + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { + // +1 For the push operation. + // +2 Extra space required. + return 3; + } else { + // Some documentation says that this is not necessary on Evergreen, + // but experimentation has show that we need to allocate 1 extra + // sub-entry for the first non-WQM push. 
+ // +1 For the push operation. + // +1 Extra space required. + return 2; + } + case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + // +1 For the push operation. + // +1 Extra space required. + return 2; + case CFStack::SUB_ENTRY: + return 1; + } +} + +void CFStack::updateMaxStackSize() { + unsigned CurrentStackSize = CurrentEntries + + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + MaxStackSize = std::max(CurrentStackSize, MaxStackSize); +} + +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (!isWQM) { + if (!ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries > 0 && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) + Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else + Item = CFStack::SUB_ENTRY; + } else + Item = CFStack::ENTRY; + break; + } + BranchStack.push_back(Item); + if (Item == CFStack::ENTRY) + CurrentEntries++; + else + CurrentSubEntries += getSubEntrySize(Item); + updateMaxStackSize(); +} + +void CFStack::pushLoop() { + LoopStack.push_back(CFStack::ENTRY); + CurrentEntries++; + updateMaxStackSize(); +} + +void CFStack::popBranch() { + CFStack::StackItem Top = BranchStack.back(); + if (Top == CFStack::ENTRY) + CurrentEntries--; + else + CurrentSubEntries-= getSubEntrySize(Top); + BranchStack.pop_back(); +} + +void CFStack::popLoop() { + CurrentEntries--; + LoopStack.pop_back(); +} + +class R600ControlFlowFinalizer : public MachineFunctionPass { + +private: + typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; + + enum ControlFlowInstruction { + CF_TC, + CF_VC, + CF_CALL_FS, + CF_WHILE_LOOP, + CF_END_LOOP, + CF_LOOP_BREAK, + CF_LOOP_CONTINUE, + CF_JUMP, + CF_ELSE, + CF_POP, + CF_END + }; + + static char ID; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + unsigned MaxFetchInst; + const AMDGPUSubtarget *ST; + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + return true; + default: + return false; + } + } + + const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { + unsigned Opcode = 0; + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + switch (CFI) { + case CF_TC: + Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + break; + case CF_VC: + Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + break; + case CF_CALL_FS: + Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + break; + case CF_WHILE_LOOP: + Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + break; + case CF_END_LOOP: + Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + break; + case CF_LOOP_BREAK: + Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + break; + case CF_LOOP_CONTINUE: + Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + break; + case CF_JUMP: + Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + break; + case CF_ELSE: + Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + break; + case CF_POP: + Opcode = isEg ? 
AMDGPU::POP_EG : AMDGPU::POP_R600; + break; + case CF_END: + if (ST->hasCaymanISA()) { + Opcode = AMDGPU::CF_END_CM; + break; + } + Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + break; + } + assert (Opcode && "No opcode selected"); + return TII->get(Opcode); + } + + bool isCompatibleWithClause(const MachineInstr *MI, + std::set<unsigned> &DstRegs) const { + unsigned DstMI, SrcMI; + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg()) + continue; + if (MO.isDef()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + DstMI = Reg; + else + DstMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + if (MO.isUse()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + SrcMI = Reg; + else + SrcMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + } + if ((DstRegs.find(SrcMI) == DstRegs.end())) { + DstRegs.insert(DstMI); + return true; + } else + return false; + } + + ClauseFile + MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<MachineInstr *> ClauseContent; + unsigned AluInstCount = 0; + bool IsTex = TII->usesTextureCache(ClauseHead); + std::set<unsigned> DstRegs; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (AluInstCount >= MaxFetchInst) + break; + if ((IsTex && !TII->usesTextureCache(I)) || + (!IsTex && !TII->usesVertexCache(I))) + break; + if (!isCompatibleWithClause(I, DstRegs)) + break; + AluInstCount ++; + ClauseContent.push_back(I); + } + MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), + getHWInstrDesc(IsTex?CF_TC:CF_VC)) + .addImm(0) // ADDR + .addImm(AluInstCount - 1); // COUNT + return ClauseFile(MIb, std::move(ClauseContent)); + } + + void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { + static const unsigned LiteralRegs[] = { + AMDGPU::ALU_LITERAL_X, + AMDGPU::ALU_LITERAL_Y, + AMDGPU::ALU_LITERAL_Z, + AMDGPU::ALU_LITERAL_W + }; + const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs = + TII->getSrcs(MI); + for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { + if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) + continue; + int64_t Imm = Srcs[i].second; + std::vector<int64_t>::iterator It = + std::find(Lits.begin(), Lits.end(), Imm); + if (It != Lits.end()) { + unsigned Index = It - Lits.begin(); + Srcs[i].first->setReg(LiteralRegs[Index]); + } else { + assert(Lits.size() < 4 && "Too many literals in Instruction Group"); + Srcs[i].first->setReg(LiteralRegs[Lits.size()]); + Lits.push_back(Imm); + } + } + } + + MachineBasicBlock::iterator insertLiterals( + MachineBasicBlock::iterator InsertPos, + const std::vector<unsigned> &Literals) const { + MachineBasicBlock *MBB = InsertPos->getParent(); + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned LiteralPair0 = Literals[i]; + unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; + InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(LiteralPair0) + .addImm(LiteralPair1); + } + return InsertPos; + } + + ClauseFile + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + 
std::vector<MachineInstr *> ClauseContent; + I++; + for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { + if (IsTrivialInst(I)) { + ++I; + continue; + } + if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) + break; + std::vector<int64_t> Literals; + if (I->isBundle()) { + MachineInstr *DeleteMI = I; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + while (++BI != E && BI->isBundledWithPred()) { + BI->unbundleFromPred(); + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BI->getOperand(i); + if (MO.isReg() && MO.isInternalRead()) + MO.setIsInternalRead(false); + } + getLiteral(&*BI, Literals); + ClauseContent.push_back(&*BI); + } + I = BI; + DeleteMI->eraseFromParent(); + } else { + getLiteral(I, Literals); + ClauseContent.push_back(I); + I++; + } + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned literal0 = Literals[i]; + unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; + MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(literal0) + .addImm(literal2); + ClauseContent.push_back(MILit); + } + } + assert(ClauseContent.size() < 128 && "ALU clause is too big"); + ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(ClauseHead, std::move(ClauseContent)); + } + + void + EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += 2 * Clause.second.size(); + } + + void + EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + Clause.first->getOperand(0).setImm(0); + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += Clause.second.size(); + } + + void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { + MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + } + void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, + unsigned Addr) const { + for (MachineInstr *MI : MIs) { + CounterPropagateAddr(MI, Addr); + } + } + +public: + R600ControlFlowFinalizer(TargetMachine &tm) + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + ST = &MF.getSubtarget<AMDGPUSubtarget>(); + MaxFetchInst = ST->getTexVTXClauseSize(); + TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo()); + TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + CFStack CFStack(ST, MFI->getShaderType()); + for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; + ++MB) { + MachineBasicBlock &MBB = *MB; + unsigned CfCount = 0; + std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; + std::vector<MachineInstr * > IfThenElseStack; + if (MFI->getShaderType() == ShaderType::VERTEX) { + BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), + getHWInstrDesc(CF_CALL_FS)); + CfCount++; + } + 
std::vector<ClauseFile> FetchClauses, AluClauses; + std::vector<MachineInstr *> LastAlu(1); + std::vector<MachineInstr *> ToPopAfter; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E;) { + if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + DEBUG(dbgs() << CfCount << ":"; I->dump();); + FetchClauses.push_back(MakeFetchClause(MBB, I)); + CfCount++; + LastAlu.back() = nullptr; + continue; + } + + MachineBasicBlock::iterator MI = I; + if (MI->getOpcode() != AMDGPU::ENDIF) + LastAlu.back() = nullptr; + if (MI->getOpcode() == AMDGPU::CF_ALU) + LastAlu.back() = MI; + I++; + bool RequiresWorkAround = + CFStack.requiresWorkAroundForInst(MI->getOpcode()); + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (RequiresWorkAround) { + DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) + .addImm(CfCount + 1) + .addImm(1); + MI->setDesc(TII->get(AMDGPU::CF_ALU)); + CfCount++; + CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else + CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + + case AMDGPU::CF_ALU: + I = MI; + AluClauses.push_back(MakeALUClause(MBB, I)); + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + break; + case AMDGPU::WHILELOOP: { + CFStack.pushLoop(); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_WHILE_LOOP)) + .addImm(1); + std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, + std::set<MachineInstr *>()); + Pair.second.insert(MIb); + LoopStack.push_back(std::move(Pair)); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDLOOP: { + CFStack.popLoop(); + std::pair<unsigned, std::set<MachineInstr *> > Pair = + std::move(LoopStack.back()); + LoopStack.pop_back(); + CounterPropagateAddr(Pair.second, CfCount); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) + .addImm(Pair.first + 1); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::IF_PREDICATE_SET: { + LastAlu.push_back(nullptr); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_JUMP)) + .addImm(0) + .addImm(0); + IfThenElseStack.push_back(MIb); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ELSE: { + MachineInstr * JumpInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(JumpInst, CfCount); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_ELSE)) + .addImm(0) + .addImm(0); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + IfThenElseStack.push_back(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDIF: { + CFStack.popBranch(); + if (LastAlu.back()) { + ToPopAfter.push_back(LastAlu.back()); + } else { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_POP)) + .addImm(CfCount + 1) + .addImm(1); + (void)MIb; + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + CfCount++; + } + + MachineInstr *IfOrElseInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(IfOrElseInst, CfCount); + IfOrElseInst->getOperand(1).setImm(1); + LastAlu.pop_back(); + MI->eraseFromParent(); + break; + } + case AMDGPU::BREAK: { + CfCount ++; + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_BREAK)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + break; + } + case AMDGPU::CONTINUE: { + MachineInstr *MIb = BuildMI(MBB, MI, 
MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_CONTINUE)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::RETURN: { + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + CfCount++; + MI->eraseFromParent(); + if (CfCount % 2) { + BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + CfCount++; + } + for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) + EmitFetchClause(I, FetchClauses[i], CfCount); + for (unsigned i = 0, e = AluClauses.size(); i < e; i++) + EmitALUClause(I, AluClauses[i], CfCount); + } + default: + if (TII->isExport(MI->getOpcode())) { + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + } + break; + } + } + for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { + MachineInstr *Alu = ToPopAfter[i]; + BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), + TII->get(AMDGPU::CF_ALU_POP_AFTER)) + .addImm(Alu->getOperand(0).getImm()) + .addImm(Alu->getOperand(1).getImm()) + .addImm(Alu->getOperand(2).getImm()) + .addImm(Alu->getOperand(3).getImm()) + .addImm(Alu->getOperand(4).getImm()) + .addImm(Alu->getOperand(5).getImm()) + .addImm(Alu->getOperand(6).getImm()) + .addImm(Alu->getOperand(7).getImm()) + .addImm(Alu->getOperand(8).getImm()); + Alu->eraseFromParent(); + } + MFI->StackSize = CFStack.MaxStackSize; + } + + return false; + } + + const char *getPassName() const override { + return "R600 Control Flow Finalizer Pass"; + } +}; + +char R600ControlFlowFinalizer::ID = 0; + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { + return new R600ControlFlowFinalizer(TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Defines.h b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h new file mode 100644 index 0000000..51d87eda --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h @@ -0,0 +1,171 @@ +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H +#define LLVM_LIB_TARGET_R600_R600DEFINES_H + +#include "llvm/MC/MCRegisterInfo.h" + +// Operand Flags +#define MO_FLAG_CLAMP (1 << 0) +#define MO_FLAG_NEG (1 << 1) +#define MO_FLAG_ABS (1 << 2) +#define MO_FLAG_MASK (1 << 3) +#define MO_FLAG_PUSH (1 << 4) +#define MO_FLAG_NOT_LAST (1 << 5) +#define MO_FLAG_LAST (1 << 6) +#define NUM_MO_FLAGS 7 + +/// \brief Helper for getting the operand index for the instruction flags +/// operand. 
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) + +namespace R600_InstFlag { + enum TIF { + TRANS_ONLY = (1 << 0), + TEX = (1 << 1), + REDUCTION = (1 << 2), + FC = (1 << 3), + TRIG = (1 << 4), + OP3 = (1 << 5), + VECTOR = (1 << 6), + //FlagOperand bits 7, 8 + NATIVE_OPERANDS = (1 << 9), + OP1 = (1 << 10), + OP2 = (1 << 11), + VTX_INST = (1 << 12), + TEX_INST = (1 << 13), + ALU_INST = (1 << 14), + LDS_1A = (1 << 15), + LDS_1A1D = (1 << 16), + IS_EXPORT = (1 << 17), + LDS_1A2D = (1 << 18) + }; +} + +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) + +/// \brief Defines for extracting register information from register encoding +#define HW_REG_MASK 0x1ff +#define HW_CHAN_SHIFT 9 + +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + +#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) +#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) + +namespace OpName { + + enum VecOps { + UPDATE_EXEC_MASK_X, + UPDATE_PREDICATE_X, + WRITE_X, + OMOD_X, + DST_REL_X, + CLAMP_X, + SRC0_X, + SRC0_NEG_X, + SRC0_REL_X, + SRC0_ABS_X, + SRC0_SEL_X, + SRC1_X, + SRC1_NEG_X, + SRC1_REL_X, + SRC1_ABS_X, + SRC1_SEL_X, + PRED_SEL_X, + UPDATE_EXEC_MASK_Y, + UPDATE_PREDICATE_Y, + WRITE_Y, + OMOD_Y, + DST_REL_Y, + CLAMP_Y, + SRC0_Y, + SRC0_NEG_Y, + SRC0_REL_Y, + SRC0_ABS_Y, + SRC0_SEL_Y, + SRC1_Y, + SRC1_NEG_Y, + SRC1_REL_Y, + SRC1_ABS_Y, + SRC1_SEL_Y, + PRED_SEL_Y, + UPDATE_EXEC_MASK_Z, + UPDATE_PREDICATE_Z, + WRITE_Z, + OMOD_Z, + DST_REL_Z, + CLAMP_Z, + SRC0_Z, + SRC0_NEG_Z, + SRC0_REL_Z, + SRC0_ABS_Z, + SRC0_SEL_Z, + SRC1_Z, + SRC1_NEG_Z, + SRC1_REL_Z, + SRC1_ABS_Z, + SRC1_SEL_Z, + PRED_SEL_Z, + UPDATE_EXEC_MASK_W, + UPDATE_PREDICATE_W, + WRITE_W, + OMOD_W, + DST_REL_W, + CLAMP_W, + SRC0_W, + SRC0_NEG_W, + SRC0_REL_W, + SRC0_ABS_W, + SRC0_SEL_W, + SRC1_W, + SRC1_NEG_W, + SRC1_REL_W, + SRC1_ABS_W, + SRC1_SEL_W, + PRED_SEL_W, + IMM_0, + IMM_1, + VEC_COUNT + }; + +} + +//===----------------------------------------------------------------------===// +// Config register definitions +//===----------------------------------------------------------------------===// + +#define R_02880C_DB_SHADER_CONTROL 0x02880C +#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) + +// These fields are the same for all shader types and families. 
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
+#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
+
+#define R_0288E8_SQ_LDS_ALLOC 0x0288E8
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
new file mode 100644
index 0000000..fdc2030
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -0,0 +1,336 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Add CF_ALU markers. R600 ALU instructions are grouped into clauses, each of
+/// which can hold up to 128 ALU instructions; these instructions can access up
+/// to 4 prefetched lines of 16 registers from the constant buffers. Such ALU
+/// clauses are initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeR600EmitClauseMarkersPass(PassRegistry&);
+}
+
+namespace {
+
+class R600EmitClauseMarkers : public MachineFunctionPass {
+
+private:
+  const R600InstrInfo *TII;
+  int Address;
+
+  unsigned OccupiedDwords(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
+      return 4;
+    case AMDGPU::KILL:
+      return 0;
+    default:
+      break;
+    }
+
+    // These will be expanded to two ALU instructions in the
+    // ExpandSpecialInstructions pass.
+ if (TII->isLDSRetInstr(MI->getOpcode())) + return 2; + + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return 4; + + unsigned NumLiteral = 0; + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++NumLiteral; + } + return 1 + NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { + if (TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } + } + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + case AMDGPU::IMPLICIT_DEF: + return true; + default: + return false; + } + } + + std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { + // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 + // (See also R600ISelLowering.cpp) + // ConstIndex value is in [0, 4095]; + return std::pair<unsigned, unsigned>( + ((Sel >> 2) - 512) >> 12, // KC_BANK + // Line Number of ConstIndex + // A line contains 16 constant registers however KCX bank can lock + // two line at the same time ; thus we want to get an even line number. + // Line number can be retrieved with (>>4), using (>>5) <<1 generates + // an even number. + ((((Sel >> 2) - 512) & 4095) >> 5) << 1); + } + + bool SubstituteKCacheBank(MachineInstr *MI, + std::vector<std::pair<unsigned, unsigned> > &CachedConsts, + bool UpdateInstr = true) const { + std::vector<std::pair<unsigned, unsigned> > UsedKCache; + + if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + return true; + + const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = + TII->getSrcs(MI); + assert((TII->isALUInstr(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + unsigned Sel = Consts[i].second; + unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; + unsigned KCacheIndex = Index * 4 + Chan; + const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); + if (CachedConsts.empty()) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); + continue; + } + if (CachedConsts[0] == BankLine) { + UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); + continue; + } + if (CachedConsts.size() == 1) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); + continue; + } + if (CachedConsts[1] == BankLine) { + UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); + continue; + } + return false; + } + + if (!UpdateInstr) + return true; + + for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + switch(UsedKCache[j].first) { + case 0: + Consts[i].first->setReg( + AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + break; + case 1: + Consts[i].first->setReg( + AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + break; + default: + llvm_unreachable("Wrong Cache Line"); + } + j++; + } + 
return true; + } + + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + std::vector<std::pair<unsigned, unsigned> > KCacheBanks, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + for (MachineInstr::const_mop_iterator + MOI = Def->operands_begin(), + MOE = Def->operands_end(); MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) + continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { + AluInstCount += OccupiedDwords(UseI); + // Make sure we won't need to end the clause due to KCache limitations. + if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + return false; + + // We have reached the maximum instruction limit before finding the + // use that kills this register, so we cannot use this def in the + // current clause. + if (AluInstCount >= TII->getMaxAlusPerClause()) + return false; + + // Register kill flags have been cleared by the time we get to this + // pass, but it is safe to assume that all uses of this register + // occur in the same basic block as its definition, because + // it is illegal for the scheduler to schedule them in + // different blocks. + if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + LastUseCount = AluInstCount; + + if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + break; + } + if (LastUseCount) + return LastUseCount <= TII->getMaxAlusPerClause(); + llvm_unreachable("Clause local register live at end of clause."); + } + return true; + } + + MachineBasicBlock::iterator + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<std::pair<unsigned, unsigned> > KCacheBanks; + bool PushBeforeModifier = false; + unsigned AluInstCount = 0; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (!isALU(I)) + break; + if (AluInstCount > TII->getMaxAlusPerClause()) + break; + if (I->getOpcode() == AMDGPU::PRED_X) { + // We put PRED_X in its own clause to ensure that ifcvt won't create + // clauses with more than 128 insts. + // IfCvt is indeed checking that "then" and "else" branches of an if + // statement have less than ~60 insts thus converted clauses can't be + // bigger than ~121 insts (predicate setter needs to be in the same + // clause as predicated alus). + if (AluInstCount > 0) + break; + if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + PushBeforeModifier = true; + AluInstCount ++; + continue; + } + // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: + // + // * KILL or INTERP instructions + // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits + // * Uses waterfalling (i.e. INDEX_MODE = AR.X) + // + // XXX: These checks have not been implemented yet. + if (TII->mustBeLastInClause(I->getOpcode())) { + I++; + break; + } + + // If this instruction defines a clause local register, make sure + // its use can fit in this clause. + if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) + break; + + if (!SubstituteKCacheBank(I, KCacheBanks)) + break; + AluInstCount += OccupiedDwords(I); + } + unsigned Opcode = PushBeforeModifier ? 
+      AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
+    // The ADDR field is not used until the R600ControlFlowFinalizer pass,
+    // where it is safe to assume it is 0. However, if we always put 0 here,
+    // the ifcvt pass may assume that identical ALU clause starters at the
+    // beginning of a true and a false branch can be factored out, which is
+    // not the case.
+        .addImm(Address++) // ADDR
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
+        .addImm(KCacheBanks.empty()?0:2) // KM0
+        .addImm((KCacheBanks.size() < 2)?0:2) // KM1
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
+        .addImm(AluInstCount) // COUNT
+        .addImm(1); // Enabled
+    return I;
+  }
+
+public:
+  static char ID;
+  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
+
+    initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+         BB != BB_E; ++BB) {
+      MachineBasicBlock &MBB = *BB;
+      MachineBasicBlock::iterator I = MBB.begin();
+      if (I->getOpcode() == AMDGPU::CF_ALU)
+        continue; // BB was already parsed
+      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
+        if (isALU(I))
+          I = MakeALUClause(MBB, I);
+        else
+          ++I;
+      }
+    }
+    return false;
+  }
+
+  const char *getPassName() const override {
+    return "R600 Emit Clause Markers Pass";
+  }
+};
+
+char R600EmitClauseMarkers::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
+                      "R600 Emit Clause Markers", false, false)
+INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
+                    "R600 Emit Clause Markers", false, false)
+
+llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+  return new R600EmitClauseMarkers();
+}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
new file mode 100644
index 0000000..211d392e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -0,0 +1,349 @@
+//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Vector, Reduction, and Cube instructions need to fill the entire instruction
+/// group to work correctly. This pass expands these individual instructions
+/// into several instructions that will completely fill the instruction group.
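+///
+/// For example, a reduction such as T0_X = DP4 T1_XYZW, T2_XYZW is expanded
+/// into four DP4 slots, one per channel, with the writes of the three unused
+/// channels masked.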
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class R600ExpandSpecialInstrsPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, + unsigned Op); + +public: + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "R600 Expand special instructions pass"; + } +}; + +} // End anonymous namespace + +char R600ExpandSpecialInstrsPass::ID = 0; + +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { + return new R600ExpandSpecialInstrsPass(TM); +} + +void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, + const MachineInstr *OldMI, unsigned Op) { + int OpIdx = TII->getOperandIdx(*OldMI, Op); + if (OpIdx > -1) { + uint64_t Val = OldMI->getOperand(OpIdx).getImm(); + TII->setImmOperand(NewMI, Op, Val); + } +} + +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + while (I != MBB.end()) { + MachineInstr &MI = *I; + I = std::next(I); + + // Expand LDS_*_RET instructions + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineOperand &DstOp = MI.getOperand(DstIdx); + MachineInstr *Mov = TII->buildMovInstr(&MBB, I, + DstOp.getReg(), AMDGPU::OQAP); + DstOp.setReg(AMDGPU::OQAP); + int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), + AMDGPU::OpName::pred_sel); + int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), + AMDGPU::OpName::pred_sel); + // Copy the pred_sel bit + Mov->getOperand(MovPredSelIdx).setReg( + MI.getOperand(LDSPredSelIdx).getReg()); + } + + switch (MI.getOpcode()) { + default: break; + // Expand PRED_X to one of the PRED_SET instructions. + case AMDGPU::PRED_X: { + uint64_t Flags = MI.getOperand(3).getImm(); + // The native opcode used by PRED_X is stored as an immediate in the + // third operand. + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + MI.getOperand(2).getImm(), // opcode + MI.getOperand(0).getReg(), // dst + MI.getOperand(1).getReg(), // src0 + AMDGPU::ZERO); // src1 + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + if (Flags & MO_FLAG_PUSH) { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); + } else { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); + } + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_XY: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = MI.getOperand(Chan).getReg(); + else + DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan >= 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_ZW: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; + else + DstReg = MI.getOperand(Chan-2).getReg(); + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan < 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_VEC_LOAD: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(1).getImm()); + unsigned DstReg = MI.getOperand(0).getReg(); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, + TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + case AMDGPU::DOT_4: { + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Mask) { + TII->addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + unsigned Opcode = BMI->getOpcode(); + // While not strictly necessary from hw point of view, we force + // all src operands of a dot4 inst to belong to the same slot. 
+ unsigned Src0 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + .getReg(); + unsigned Src1 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + .getReg(); + (void) Src0; + (void) Src1; + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && + (TRI.getEncodingValue(Src1) & 0xff) < 127) + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); + } + MI.eraseFromParent(); + continue; + } + } + + bool IsReduction = TII->isReductionOp(MI.getOpcode()); + bool IsVector = TII->isVector(MI); + bool IsCube = TII->isCubeOp(MI.getOpcode()); + if (!IsReduction && !IsVector && !IsCube) { + continue; + } + + // Expand the instruction + // + // Reduction instructions: + // T0_X = DP4 T1_XYZW, T2_XYZW + // becomes: + // TO_X = DP4 T1_X, T2_X + // TO_Y (write masked) = DP4 T1_Y, T2_Y + // TO_Z (write masked) = DP4 T1_Z, T2_Z + // TO_W (write masked) = DP4 T1_W, T2_W + // + // Vector instructions: + // T0_X = MULLO_INT T1_X, T2_X + // becomes: + // T0_X = MULLO_INT T1_X, T2_X + // T0_Y (write masked) = MULLO_INT T1_X, T2_X + // T0_Z (write masked) = MULLO_INT T1_X, T2_X + // T0_W (write masked) = MULLO_INT T1_X, T2_X + // + // Cube instructions: + // T0_XYZW = CUBE T1_XYZW + // becomes: + // TO_X = CUBE T1_Z, T1_Y + // T0_Y = CUBE T1_Z, T1_X + // T0_Z = CUBE T1_X, T1_Z + // T0_W = CUBE T1_Y, T1_Z + for (unsigned Chan = 0; Chan < 4; Chan++) { + unsigned DstReg = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + unsigned Src0 = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + unsigned Src1 = 0; + + // Determine the correct source registers + if (!IsCube) { + int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + if (Src1Idx != -1) { + Src1 = MI.getOperand(Src1Idx).getReg(); + } + } + if (IsReduction) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + Src0 = TRI.getSubReg(Src0, SubRegIndex); + Src1 = TRI.getSubReg(Src1, SubRegIndex); + } else if (IsCube) { + static const int CubeSrcSwz[] = {2, 2, 0, 1}; + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + Src1 = TRI.getSubReg(Src0, SubRegIndex1); + Src0 = TRI.getSubReg(Src0, SubRegIndex0); + } + + // Determine the correct destination registers; + bool Mask = false; + bool NotLast = true; + if (IsCube) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + DstReg = TRI.getSubReg(DstReg, SubRegIndex); + } else { + // Mask the write if the original instruction does not write to + // the current Channel. 
+ Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + } + + // Set the IsLast bit + NotLast = (Chan != 3 ); + + // Add the new instruction + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::CUBE_r600_pseudo: + Opcode = AMDGPU::CUBE_r600_real; + break; + case AMDGPU::CUBE_eg_pseudo: + Opcode = AMDGPU::CUBE_eg_real; + break; + default: + break; + } + + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); + + if (Chan != 0) + NewMI->bundleWithPred(); + if (Mask) { + TII->addFlag(NewMI, 0, MO_FLAG_MASK); + } + if (NotLast) { + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + } + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + } + MI.eraseFromParent(); + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp new file mode 100644 index 0000000..124a9c6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -0,0 +1,2298 @@ +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for R600 +// +//===----------------------------------------------------------------------===// + +#include "R600ISelLowering.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +R600TargetLowering::R600TargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + // Set condition code actions + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setCondCodeAction(ISD::SETLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, 
MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + + setCondCodeAction(ISD::SETLE, MVT::i32, Expand); + setCondCodeAction(ISD::SETLT, MVT::i32, Expand); + setCondCodeAction(ISD::SETULE, MVT::i32, Expand); + setCondCodeAction(ISD::SETULT, MVT::i32, Expand); + + setOperationAction(ISD::FCOS, MVT::f32, Custom); + setOperationAction(ISD::FSIN, MVT::f32, Custom); + + setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v2i32, Expand); + + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + setOperationAction(ISD::FSUB, MVT::f32, Expand); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + + // ADD, SUB overflow. + // TODO: turn these into Legal? + if (Subtarget->hasCARRY()) + setOperationAction(ISD::UADDO, MVT::i32, Custom); + + if (Subtarget->hasBORROW()) + setOperationAction(ISD::USUBO, MVT::i32, Custom); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + + + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. 
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + + setSchedulingPreference(Sched::Source); +} + +static inline bool isEOP(MachineBasicBlock::iterator I) { + return std::next(I)->getOpcode() == AMDGPU::RETURN; +} + +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineFunction * MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock::iterator I = *MI; + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); + + switch (MI->getOpcode()) { + default: + // Replace LDS_*_RET instruction that don't have any uses with the + // equivalent LDS_*_NORET instruction. + if (TII->isLDSRetInstr(MI->getOpcode())) { + int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineInstrBuilder NewMI; + // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add + // LDS_1A2D support and remove this special case. 
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || + MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + return BB; + + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI->getOperand(i)); + } + } else { + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + } + break; + case AMDGPU::CLAMP_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + break; + } + + case AMDGPU::FABS_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_ABS); + break; + } + + case AMDGPU::FNEG_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_NEG); + break; + } + + case AMDGPU::MASK_WRITE: { + unsigned maskedRegister = MI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); + TII->addFlag(defInstr, 0, MO_FLAG_MASK); + break; + } + + case AMDGPU::MOV_IMM_F32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getFPImm()->getValueAPF() + .bitcastToAPInt().getZExtValue()); + break; + case AMDGPU::MOV_IMM_I32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); + break; + case AMDGPU::CONST_COPY: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, + MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, + MI->getOperand(1).getImm()); + break; + } + + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_64_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(isEOP(I)); // Set End of program bit + break; + } + case AMDGPU::RAT_STORE_TYPED_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit + break; + } + + case AMDGPU::TXD: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + 
.addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::TXD_SHADOW: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)); + break; + + case AMDGPU::BRANCH_COND_f32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + 
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::BRANCH_COND_i32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if its not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = std::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = std::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } + bool EOP = isEOP(I); + if (!EOP && !isLastInstructionOfItsType) + return BB; + unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addImm(CfInst) + .addImm(EOP); + break; + } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. 
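+ // Append every register recorded in R600MachineFunctionInfo::LiveOuts as an
+ // implicit use of the RETURN.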
+ R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } + } + + MI->eraseFromParent(); + return BB; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); + case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); + case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); + case ISD::FCOS: + case ISD::FSIN: return LowerTrig(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::INTRINSIC_VOID: { + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntrinsicID) { + case AMDGPUIntrinsic::AMDGPU_store_output: { + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MFI->LiveOuts.push_back(Reg); + return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); + } + case AMDGPUIntrinsic::R600_store_swizzle: { + SDLoc DL(Op); + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, DL, MVT::i32), // SWZ_X + DAG.getConstant(1, DL, MVT::i32), // SWZ_Y + DAG.getConstant(2, DL, MVT::i32), // SWZ_Z + DAG.getConstant(3, DL, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); + } + + // default for switch(IntrinsicID) + default: break; + } + // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + switch(IntrinsicID) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case AMDGPUIntrinsic::R600_load_input: { + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(Reg); + return DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), Reg, VT); + } + + case AMDGPUIntrinsic::R600_interp_input: { + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + int ijb = 
cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); + MachineSDNode *interp; + if (ijb < 0) { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); + unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); + MRI.addLiveIn(RegisterI); + MRI.addLiveIn(RegisterJ); + SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); + SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + return SDValue(interp, slot % 2); + } + case AMDGPUIntrinsic::R600_interp_xy: + case AMDGPUIntrinsic::R600_interp_zw: { + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + MachineSDNode *interp; + SDValue RegisterINode = Op.getOperand(2); + SDValue RegisterJNode = Op.getOperand(3); + + if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, + SDValue(interp, 0), SDValue(interp, 1)); + } + case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::R600_ldptr: { + unsigned TextureOp; + switch (IntrinsicID) { + case AMDGPUIntrinsic::R600_tex: + TextureOp = 0; + break; + case AMDGPUIntrinsic::R600_texc: + TextureOp = 1; + break; + case AMDGPUIntrinsic::R600_txl: + TextureOp = 2; + break; + case AMDGPUIntrinsic::R600_txlc: + TextureOp = 3; + break; + case AMDGPUIntrinsic::R600_txb: + TextureOp = 4; + break; + case AMDGPUIntrinsic::R600_txbc: + TextureOp = 5; + break; + case AMDGPUIntrinsic::R600_txf: + TextureOp = 6; + break; + case AMDGPUIntrinsic::R600_txq: + TextureOp = 7; + break; + case AMDGPUIntrinsic::R600_ddx: + TextureOp = 8; + break; + case AMDGPUIntrinsic::R600_ddy: + TextureOp = 9; + break; + case AMDGPUIntrinsic::R600_ldptr: + TextureOp = 10; + break; + default: + llvm_unreachable("Unknow Texture Operation"); + } + + SDValue TexArgs[19] = { + DAG.getConstant(TextureOp, DL, MVT::i32), + Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + Op.getOperand(2), + Op.getOperand(3), + 
Op.getOperand(4), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10) + }; + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); + } + case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(3, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(3, DL, MVT::i32)) + }; + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); + } + + case Intrinsic::r600_read_ngroups_x: + return LowerImplicitParameter(DAG, VT, DL, 0); + case Intrinsic::r600_read_ngroups_y: + return LowerImplicitParameter(DAG, VT, DL, 1); + case Intrinsic::r600_read_ngroups_z: + return LowerImplicitParameter(DAG, VT, DL, 2); + case Intrinsic::r600_read_global_size_x: + return LowerImplicitParameter(DAG, VT, DL, 3); + case Intrinsic::r600_read_global_size_y: + return LowerImplicitParameter(DAG, VT, DL, 4); + case Intrinsic::r600_read_global_size_z: + return LowerImplicitParameter(DAG, VT, DL, 5); + case Intrinsic::r600_read_local_size_x: + return LowerImplicitParameter(DAG, VT, DL, 6); + case Intrinsic::r600_read_local_size_y: + return LowerImplicitParameter(DAG, VT, DL, 7); + case Intrinsic::r600_read_local_size_z: + return LowerImplicitParameter(DAG, VT, DL, 8); + + case Intrinsic::AMDGPU_read_workdim: { + uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); + return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); + } + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) + break; + } + } // end switch(Op.getOpcode()) + return SDValue(); +} + +void R600TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); + return; + case ISD::FP_TO_UINT: + if (N->getValueType(0) == MVT::i1) { + Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + } + // Fall-through. Since we don't care about out of bounds values + // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint + // considers some extra cases which are not necessary here. + case ISD::FP_TO_SINT: { + SDValue Result; + if (expandFP_TO_SINT(N, Result, DAG)) + Results.push_back(Result); + return; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + LowerUDIVREM64(Op, DAG, Results); + break; + } + } +} + +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector<SDValue, 8> Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, + DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); +} + +SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + // On hw >= R700, COS/SIN input must be between -1. and 1. + // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDLoc DL(Op); + + // TODO: Should this propagate fast-math-flags? 
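+ // 0.15915494309 below is 1/(2*Pi): the argument is scaled into [0, 1) by
+ // FRACT and then re-centered around zero before being fed to the hardware
+ // COS/SIN nodes.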
+ SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.15915494309, DL, MVT::f32)), + DAG.getConstantFP(0.5, DL, MVT::f32))); + unsigned TrigNode; + switch (Op.getOpcode()) { + case ISD::FCOS: + TrigNode = AMDGPUISD::COS_HW; + break; + case ISD::FSIN: + TrigNode = AMDGPUISD::SIN_HW; + break; + default: + llvm_unreachable("Wrong trig opcode"); + } + SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, FractPart, + DAG.getConstantFP(-0.5, DL, MVT::f32))); + if (Gen >= AMDGPUSubtarget::R700) + return TrigVal; + // On R600 hw, COS/SIN input must be between -Pi and Pi. + return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, + DAG.getConstantFP(3.14159265359, DL, MVT::f32)); +} + +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); + // Extend sign. + OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, + DAG.getValueType(MVT::i1)); + + SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); +} + +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(0.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETNE) + ); +} + +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, + unsigned DwordOffset) const { + unsigned ByteOffset = DwordOffset * 4; + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // We shouldn't be using an offset wider than 16-bits for implicit parameters. + assert(isInt<16>(ByteOffset)); + + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR + MachinePointerInfo(ConstantPointerNull::get(PtrType)), + false, false, false, 0); +} + +bool R600TargetLowering::isZero(SDValue Op) const { + if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + return Cst->isNullValue(); + } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ + return CstFP->isZero(); + } else { + return false; + } +} + +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + SDValue Temp; + + if (VT == MVT::f32) { + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + if (MinMax) + return MinMax; + } + + // LHS and RHS are guaranteed to be the same value type + EVT CompareVT = LHS.getValueType(); + + // Check if we can lower this to a native operation. + + // Try to lower to a SET* instruction: + // + // SET* can match the following patterns: + // + // select_cc f32, f32, -1, 0, cc_supported + // select_cc f32, f32, 1.0f, 0.0f, cc_supported + // select_cc i32, i32, -1, 0, cc_supported + // + + // Move hardware True/False values to the correct operand. 
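+ // Hardware true is -1 (integer) or 1.0f and hardware false is 0 / 0.0f;
+ // when the operands arrive in the opposite order, invert (and, if needed,
+ // swap) the condition code so a native SET* instruction can still be used.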
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + ISD::CondCode InverseCC = + ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + if (isHWTrueValue(False) && isHWFalseValue(True)) { + if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { + std::swap(False, True); + CC = DAG.getCondCode(InverseCC); + } else { + ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); + if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { + std::swap(False, True); + std::swap(LHS, RHS); + CC = DAG.getCondCode(SwapInvCC); + } + } + } + + if (isHWTrueValue(True) && isHWFalseValue(False) && + (CompareVT == VT || VT == MVT::i32)) { + // This can be matched by a SET* instruction. + return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); + } + + // Try to lower to a CND* instruction: + // + // CND* can match the following patterns: + // + // select_cc f32, 0.0, f32, f32, cc_supported + // select_cc f32, 0.0, i32, i32, cc_supported + // select_cc i32, 0, f32, f32, cc_supported + // select_cc i32, 0, i32, i32, cc_supported + // + + // Try to move the zero value to the RHS + if (isZero(LHS)) { + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + // Try swapping the operands + ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); + if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { + std::swap(LHS, RHS); + CC = DAG.getCondCode(CCSwapped); + } else { + // Try inverting the conditon and then swapping the operands + ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); + CCSwapped = ISD::getSetCCSwappedOperands(CCInv); + if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { + std::swap(True, False); + std::swap(LHS, RHS); + CC = DAG.getCondCode(CCSwapped); + } + } + } + if (isZero(RHS)) { + SDValue Cond = LHS; + SDValue Zero = RHS; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + if (CompareVT != VT) { + // Bitcast True / False to the correct types. This will end up being + // a nop, but it allows us to define only a single pattern in the + // .TD files for each CND* instruction rather than having to have + // one pattern for integer True/False and one for fp True/False + True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); + False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); + } + + switch (CCOpcode) { + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + Temp = True; + True = False; + False = Temp; + break; + default: + break; + } + SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, + Cond, Zero, + True, False, + DAG.getCondCode(CCOpcode)); + return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); + } + + // If we make it this for it means we have no native instructions to handle + // this SELECT_CC, so we must lower it. + SDValue HWTrue, HWFalse; + + if (CompareVT == MVT::f32) { + HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); + HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); + } else if (CompareVT == MVT::i32) { + HWTrue = DAG.getConstant(-1, DL, CompareVT); + HWFalse = DAG.getConstant(0, DL, CompareVT); + } + else { + llvm_unreachable("Unhandled value type in LowerSELECT_CC"); + } + + // Lower this unsupported SELECT_CC into a combination of two supported + // SELECT_CC operations. 
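+ // The first SELECT_CC materializes the comparison as HWTrue/HWFalse in
+ // CompareVT; the second one picks between the original True/False values by
+ // comparing that result against HWFalse with SETNE.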
+ SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); + + return DAG.getNode(ISD::SELECT_CC, DL, VT, + Cond, HWFalse, + True, False, + DAG.getCondCode(ISD::SETNE)); +} + +/// LLVM generates byte-addressed pointers. For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// for indirect addressing. +SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + SDLoc DL(Ptr); + return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, DL, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Ptr = Op.getOperand(2); + + SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Result.getNode()) { + return Result; + } + + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + if (StoreNode->isTruncatingStore()) { + EVT VT = Value.getValueType(); + assert(VT.bitsLE(MVT::i32)); + EVT MemVT = StoreNode->getMemoryVT(); + SDValue MaskConstant; + if (MemVT == MVT::i8) { + MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); + } else { + assert(MemVT == MVT::i16); + MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); + } + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(0x00000003, DL, VT)); + SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, DL, VT)); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 + // vector instead. + SDValue Src[4] = { + ShiftedValue, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + Mask + }; + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); + SDValue Args[3] = { Chain, Input, DWordAddr }; + return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, + Op->getVTList(), Args, MemVT, + StoreNode->getMemOperand()); + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && + Value.getValueType().bitsGE(MVT::i32)) { + // Convert pointer from byte address to dword address. 
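+ // The dword address is the byte address shifted right by two; wrapping it
+ // in a DWORDADDR node marks the pointer as already converted.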
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, DL, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + llvm_unreachable("Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; + } + } + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) { + return Ret; + } + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SmallVector<SDValue, 4> Stores(NumElemVT); + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, DL, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + } + + return Chain; +} + +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: + return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + SDLoc DL(Op); + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG)) + return Ret; + + // Lower loads constant address space 
global variable loads + if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa<GlobalVariable>(GetUnderlyingObject( + LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) { + + SDValue Ptr = DAG.getZExtOrTrunc( + LoadNode->getBasePtr(), DL, + getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS)); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(2, DL, MVT::i32)); + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), + LoadNode->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + } + + if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { + SDValue MergedValues[2] = { + ScalarizeVectorLoad(Op, DAG), + Chain + }; + return DAG.getMergeValues(MergedValues, DL); + } + + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + if (ConstantBlock > -1 && + ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || + (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { + SDValue Result; + if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || + isa<Constant>(LoadNode->getMemOperand()->getValue()) || + isa<ConstantSDNode>(Ptr)) { + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + EVT NewVT = MVT::v4i32; + unsigned NumElements = 4; + if (VT.isVector()) { + NewVT = VT; + NumElements = VT.getVectorNumElements(); + } + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, + makeArrayRef(Slots, NumElements)); + } else { + // non-constant ptr can't be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(4, DL, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) + ); + } + + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i32)); + } + + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, DL); + } + + // For most operations returning SDValue() will result in the node being + // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we + // need to manually expand loads that may be legal in some address spaces and + // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for + // compute shaders, since the data is sign extended when it is uploaded to the + // buffer. However SEXT loads from other address spaces are not supported, so + // we need to expand them here. 
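+ // The expansion below replaces the sign-extending load with a plain
+ // extending load of the memory type followed by SIGN_EXTEND_INREG.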
+ if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { + EVT MemVT = LoadNode->getMemoryVT(); + assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); + SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, + LoadNode->getPointerInfo(), MemVT, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + LoadNode->getAlignment()); + SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, + DAG.getValueType(MemVT)); + + SDValue MergedValues[2] = { Res, Chain }; + return DAG.getMergeValues(MergedValues, DL); + } + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2] = { + LoweredLoad, + Chain + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); +} + +/// XXX Only kernel functions are supported, so we can assume for now that +/// every function is a kernel function, but in the future we should use +/// separate calling conventions for kernel and non-kernel functions. +SDValue R600TargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + SmallVector<ISD::InputArg, 8> LocalIns; + + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); + + AnalyzeFormalArguments(CCInfo, LocalIns); + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + CCValAssign &VA = ArgLocs[i]; + const ISD::InputArg &In = Ins[i]; + EVT VT = In.VT; + EVT MemVT = VA.getLocVT(); + if (!VT.isVector() && MemVT.isVector()) { + // Get load source type if scalarized. 
+ MemVT = MemVT.getVectorElementType(); + } + + if (MFI->getShaderType() != ShaderType::COMPUTE) { + unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); + SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); + InVals.push_back(Register); + continue; + } + + PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // i64 isn't a legal type, so the register type used ends up as i32, which + // isn't expected here. It attempts to create this sextload, but it ends up + // being invalid. Somehow this seems to work with i64 arguments, but breaks + // for <1 x i64>. + + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + ISD::LoadExtType Ext = ISD::NON_EXTLOAD; + if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. + + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } + + // Compute the offset from the value. + // XXX - I think PartOffset should give you this, but it seems to give the + // size of the register which isn't useful. + + unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); + unsigned PartOffset = VA.getLocMemOffset(); + unsigned Offset = 36 + VA.getLocMemOffset(); + + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); + SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, DL, MVT::i32), + DAG.getUNDEF(MVT::i32), + PtrInfo, + MemVT, false, true, true, 4); + + // 4 is the preferred alignment for the CONSTANT memory space. + InVals.push_back(Arg); + MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); + } + return Chain; +} + +EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +static SDValue CompactSwizzlableVector( + SelectionDAG &DAG, SDValue VectorEntry, + DenseMap<unsigned, unsigned> &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + // We mask write here to teach later passes that the ith element of this + // vector is undef. Thus we can use it to reduce 128 bits reg usage, + // break false dependencies and additionnaly make assembly easier to read. 
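+ // Swizzle selector encodings used in this routine: 4 selects constant 0,
+ // 5 selects constant 1 and 7 masks the write for the channel.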
+ RemapSwizzle[i] = 7; // SEL_MASK_WRITE + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) { + if (C->isZero()) { + RemapSwizzle[i] = 4; // SEL_0 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } else if (C->isExactlyValue(1.0)) { + RemapSwizzle[i] = 5; // SEL_1 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } + } + + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + continue; + for (unsigned j = 0; j < i; j++) { + if (NewBldVec[i] == NewBldVec[j]) { + NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); + RemapSwizzle[i] = j; + break; + } + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + +static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, + DenseMap<unsigned, unsigned> &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + bool isUnmovable[4] = { false, false, false, false }; + for (unsigned i = 0; i < 4; i++) { + RemapSwizzle[i] = i; + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (i == Idx) + isUnmovable[Idx] = true; + } + } + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (isUnmovable[Idx]) + continue; + // Swap i and Idx + std::swap(NewBldVec[Idx], NewBldVec[i]); + std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); + break; + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + + +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, + SDValue Swz[4], SelectionDAG &DAG, + SDLoc DL) const { + assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); + // Old -> New swizzle values + DenseMap<unsigned, unsigned> SwizzleRemap; + + BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + SwizzleRemap.clear(); + BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + return BuildVector; +} + + +//===----------------------------------------------------------------------===// +// Custom DAG Optimizations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) + case ISD::FP_ROUND: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { + return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), + Arg.getOperand(0)); + } + break; + } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL 
frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. + case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + SDLoc dl(N); + return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, dl, MVT::i32), // True + DAG.getConstant(0, dl, MVT::i32), // False + SelectCC.getOperand(4)); // CC + + break; + } + + // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx + // => build_vector elt0, ... , NewEltIdx, ... , eltN + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = N->getOperand(0); + SDValue InVal = N->getOperand(1); + SDValue EltNo = N->getOperand(2); + SDLoc dl(N); + + // If the inserted element is an UNDEF, just use the input vector. + if (InVal.getOpcode() == ISD::UNDEF) + return InVec; + + EVT VT = InVec.getValueType(); + + // If we can't generate a legal BUILD_VECTOR, exit + if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) + return SDValue(); + + // Check that we know which element is being inserted + if (!isa<ConstantSDNode>(EltNo)) + return SDValue(); + unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially + // be converted to a BUILD_VECTOR). Fill in the Ops vector with the + // vector elements. + SmallVector<SDValue, 8> Ops; + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + Ops.append(InVec.getNode()->op_begin(), + InVec.getNode()->op_end()); + } else if (InVec.getOpcode() == ISD::UNDEF) { + unsigned NElts = VT.getVectorNumElements(); + Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); + } else { + return SDValue(); + } + + // Insert the element + if (Elt < Ops.size()) { + // All the operands of BUILD_VECTOR must have the same type; + // we enforce that here. + EVT OpVT = Ops[0].getValueType(); + if (InVal.getValueType() != OpVT) + InVal = OpVT.bitsGT(InVal.getValueType()) ? 
+ DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : + DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); + Ops[Elt] = InVal; + } + + // Return the new vector + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + if (Arg.getOpcode() == ISD::BITCAST && + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), + Arg->getOperand(0).getOperand(Element)); + } + } + break; + } + + case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + // + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> + // selectcc x, y, a, b, cc + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + if (LHS.getOperand(2).getNode() != True.getNode() || + LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode()) { + return SDValue(); + } + + switch (NCC) { + default: return SDValue(); + case ISD::SETNE: return LHS; + case ISD::SETEQ: { + ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get(); + LHSCC = ISD::getSetCCInverse(LHSCC, + LHS.getOperand(0).getValueType().isInteger()); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) + return DAG.getSelectCC(SDLoc(N), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + LHSCC); + break; + } + } + return SDValue(); + } + + case AMDGPUISD::EXPORT: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[8] = { + N->getOperand(0), // Chain + SDValue(), + N->getOperand(2), // ArrayBase + N->getOperand(3), // Type + N->getOperand(4), // SWZ_X + N->getOperand(5), // SWZ_Y + N->getOperand(6), // SWZ_Z + N->getOperand(7) // SWZ_W + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); + } + case AMDGPUISD::TEXTURE_FETCH: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[19] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + N->getOperand(3), + N->getOperand(4), + N->getOperand(5), + N->getOperand(6), + N->getOperand(7), + N->getOperand(8), + N->getOperand(9), + N->getOperand(10), + N->getOperand(11), + N->getOperand(12), + N->getOperand(13), + N->getOperand(14), + N->getOperand(15), + N->getOperand(16), + N->getOperand(17), + N->getOperand(18), + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); + } + } + + return 
AMDGPUTargetLowering::PerformDAGCombine(N, DCI); +} + +static bool +FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, + SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + if (!Src.isMachineOpcode()) + return false; + switch (Src.getMachineOpcode()) { + case AMDGPU::FNEG_R600: + if (!Neg.getNode()) + return false; + Src = Src.getOperand(0); + Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::FABS_R600: + if (!Abs.getNode()) + return false; + Src = Src.getOperand(0); + Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::CONST_COPY: { + unsigned Opcode = ParentNode->getMachineOpcode(); + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + + if (!Sel.getNode()) + return false; + + SDValue CstOffset = Src.getOperand(0); + if (ParentNode->getValueType(0).isVector()) + return false; + + // Gather constants values + int SrcIndices[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + std::vector<unsigned> Consts; + for (int OtherSrcIdx : SrcIndices) { + int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); + if (OtherSrcIdx < 0 || OtherSelIdx < 0) + continue; + if (HasDst) { + OtherSrcIdx--; + OtherSelIdx--; + } + if (RegisterSDNode *Reg = + dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst + = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); + Consts.push_back(Cst->getZExtValue()); + } + } + } + + ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); + Consts.push_back(Cst->getZExtValue()); + if (!TII->fitsConstReadLimitations(Consts)) { + return false; + } + + Sel = CstOffset; + Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + return true; + } + case AMDGPU::MOV_IMM_I32: + case AMDGPU::MOV_IMM_F32: { + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + uint64_t ImmValue = 0; + + + if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); + float FloatValue = FPC->getValueAPF().convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); + } + } else { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0)); + uint64_t Value = C->getZExtValue(); + if (Value == 0) { + ImmReg = AMDGPU::ZERO; + } else if (Value == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = Value; + } + } + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. 
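+    // If the value did not match one of the inline hardware constants above
+    // (ZERO/HALF/ONE/ONE_INT), it has to go through the literal slot: the
+    // parent instruction's literal operand must exist and still be unused
+    // (zero) before it can be overwritten with the raw bits and the source
+    // redirected to ALU_LITERAL_X.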
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (!Imm.getNode()) + return false; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm); + assert(C); + if (C->getZExtValue()) + return false; + Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); + } + Src = DAG.getRegister(ImmReg, MVT::i32); + return true; + } + default: + return false; + } +} + + +/// \brief Fold the instructions after selecting them +SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + if (!Node->isMachineOpcode()) + return Node; + unsigned Opcode = Node->getMachineOpcode(); + SDValue FakeOp; + + std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); + + if (Opcode == AMDGPU::DOT_4) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + }; + for (unsigned i = 0; i < 8; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue &Abs = Ops[AbsIdx[i] - 1]; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + if (HasDst) + SelIdx--; + SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::REG_SEQUENCE) { + for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { + SDValue &Src = Ops[i]; + if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::CLAMP_R600) { + SDValue Src = Node->getOperand(0); + if (!Src.isMachineOpcode() || + !TII->hasInstrModifiers(Src.getMachineOpcode())) + return Node; + int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), + AMDGPU::OpName::clamp); + if (ClampIdx < 0) + return Node; + SDLoc DL(Node); + std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); + Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); + return DAG.getMachineNode(Src.getMachineOpcode(), DL, + Node->getVTList(), Ops); + } else { + if (!TII->hasInstrModifiers(Opcode)) + return Node; + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + -1 + }; + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue FakeAbs; + SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + if (HasDst) { + SelIdx--; + ImmIdx--; + } + SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; + SDValue &Imm = Ops[ImmIdx]; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } + + return Node; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h new file mode 100644 index 0000000..4dbac97 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -0,0 +1,82 @@ +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H + +#include "AMDGPUISelLowering.h" + +namespace llvm { + +class R600InstrInfo; + +class R600TargetLowering : public AMDGPUTargetLowering { +public: + R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const override; + +private: + unsigned Gen; + /// Each OpenCL kernel has nine implicit parameters that are stored in the + /// first nine dwords of a Vertex Buffer. These implicit parameters are + /// lowered to load instructions which retrieve the values from the Vertex + /// Buffer. + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, unsigned DwordOffset) const; + + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, + MachineRegisterInfo & MRI, unsigned dword_offset) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, + SDLoc DL) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; + bool isZero(SDValue Op) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; +}; + +} // End namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td new file mode 100644 index 0000000..0ffd485 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td @@ -0,0 +1,495 @@ +//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// R600 Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin> + : AMDGPUInst <outs, ins, asm, pattern> { + + field bits<64> Inst; + bit Trig = 0; + bit Op3 = 0; + bit isVector = 0; + bits<2> FlagOperandIdx = 0; + bit Op1 = 0; + bit Op2 = 0; + bit LDS_1A = 0; + bit LDS_1A1D = 0; + bit HasNativeOperands = 0; + bit VTXInst = 0; + bit TEXInst = 0; + bit ALUInst = 0; + bit IsExport = 0; + bit LDS_1A2D = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = itin; + + // No AsmMatcher support. + let isCodeGenOnly = 1; + + let TSFlags{4} = Trig; + let TSFlags{5} = Op3; + + // Vector instructions are instructions that must fill all slots in an + // instruction group + let TSFlags{6} = isVector; + let TSFlags{8-7} = FlagOperandIdx; + let TSFlags{9} = HasNativeOperands; + let TSFlags{10} = Op1; + let TSFlags{11} = Op2; + let TSFlags{12} = VTXInst; + let TSFlags{13} = TEXInst; + let TSFlags{14} = ALUInst; + let TSFlags{15} = LDS_1A; + let TSFlags{16} = LDS_1A1D; + let TSFlags{17} = IsExport; + let TSFlags{18} = LDS_1A2D; +} + +//===----------------------------------------------------------------------===// +// ALU instructions +//===----------------------------------------------------------------------===// + +class R600_ALU_LDS_Word0 { + field bits<32> Word0; + + bits<11> src0; + bits<1> src0_rel; + bits<11> src1; + bits<1> src1_rel; + bits<3> index_mode = 0; + bits<2> pred_sel; + bits<1> last; + + bits<9> src0_sel = src0{8-0}; + bits<2> src0_chan = src0{10-9}; + bits<9> src1_sel = src1{8-0}; + bits<2> src1_chan = src1{10-9}; + + let Word0{8-0} = src0_sel; + let Word0{9} = src0_rel; + let Word0{11-10} = src0_chan; + let Word0{21-13} = src1_sel; + let Word0{22} = src1_rel; + let Word0{24-23} = src1_chan; + let Word0{28-26} = index_mode; + let Word0{30-29} = pred_sel; + let Word0{31} = last; +} + +class R600ALU_Word0 : R600_ALU_LDS_Word0 { + + bits<1> src0_neg; + bits<1> src1_neg; + + let Word0{12} = src0_neg; + let Word0{25} = src1_neg; +} + +class R600ALU_Word1 { + field bits<32> Word1; + + bits<11> dst; + bits<3> bank_swizzle; + bits<1> dst_rel; + bits<1> clamp; + + bits<7> dst_sel = dst{6-0}; + bits<2> dst_chan = dst{10-9}; + + let Word1{20-18} = bank_swizzle; + let Word1{27-21} = dst_sel; + let Word1{28} = dst_rel; + let Word1{30-29} = dst_chan; + let Word1{31} = clamp; +} + +class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{ + + bits<1> src0_abs; + bits<1> src1_abs; + bits<1> update_exec_mask; + bits<1> update_pred; + bits<1> write; + bits<2> omod; + + let Word1{0} = src0_abs; + let Word1{1} = src1_abs; + let Word1{2} = update_exec_mask; + let Word1{3} = update_pred; + let Word1{4} = write; + let Word1{6-5} = omod; + let Word1{17-7} = alu_inst; +} + +class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{ + + bits<11> src2; + bits<1> src2_rel; + bits<1> src2_neg; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = src2_neg; + let Word1{17-13} = alu_inst; +} + +class R600LDS_Word1 { + field bits<32> Word1; + + bits<11> src2; + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + bits<1> src2_rel; + // offset 
specifies the stride offset to the second set of data to be read + // from. This is a dword offset. + bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP + bits<3> bank_swizzle; + bits<6> lds_op; + bits<2> dst_chan = 0; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{17-13} = alu_inst; + let Word1{20-18} = bank_swizzle; + let Word1{26-21} = lds_op; + let Word1{30-29} = dst_chan; +} + + +/* +XXX: R600 subtarget uses a slightly different encoding than the other +subtargets. We currently handle this in R600MCCodeEmitter, but we may +want to use these instruction classes in the future. + +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { + + bits<1> fog_merge; + bits<10> alu_inst; + + let Inst{37} = fog_merge; + let Inst{39-38} = omod; + let Inst{49-40} = alu_inst; +} + +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { + + bits<11> alu_inst; + + let Inst{38-37} = omod; + let Inst{49-39} = alu_inst; +} +*/ + +//===----------------------------------------------------------------------===// +// Vertex Fetch instructions +//===----------------------------------------------------------------------===// + +class VTX_WORD0 { + field bits<32> Word0; + bits<7> src_gpr; + bits<5> VC_INST; + bits<2> FETCH_TYPE; + bits<1> FETCH_WHOLE_QUAD; + bits<8> BUFFER_ID; + bits<1> SRC_REL; + bits<2> SRC_SEL_X; + + let Word0{4-0} = VC_INST; + let Word0{6-5} = FETCH_TYPE; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = BUFFER_ID; + let Word0{22-16} = src_gpr; + let Word0{23} = SRC_REL; + let Word0{25-24} = SRC_SEL_X; +} + +class VTX_WORD0_eg : VTX_WORD0 { + + bits<6> MEGA_FETCH_COUNT; + + let Word0{31-26} = MEGA_FETCH_COUNT; +} + +class VTX_WORD0_cm : VTX_WORD0 { + + bits<2> SRC_SEL_Y; + bits<2> STRUCTURED_READ; + bits<1> LDS_REQ; + bits<1> COALESCED_READ; + + let Word0{27-26} = SRC_SEL_Y; + let Word0{29-28} = STRUCTURED_READ; + let Word0{30} = LDS_REQ; + let Word0{31} = COALESCED_READ; +} + +class VTX_WORD1_GPR { + field bits<32> Word1; + bits<7> dst_gpr; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<1> USE_CONST_FIELDS; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL; + bits<1> FORMAT_COMP_ALL; + bits<1> SRF_MODE_ALL; + + let Word1{6-0} = dst_gpr; + let Word1{7} = DST_REL; + let Word1{8} = 0; // Reserved + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{21} = USE_CONST_FIELDS; + let Word1{27-22} = DATA_FORMAT; + let Word1{29-28} = NUM_FORMAT_ALL; + let Word1{30} = FORMAT_COMP_ALL; + let Word1{31} = SRF_MODE_ALL; +} + +//===----------------------------------------------------------------------===// +// Texture fetch instructions +//===----------------------------------------------------------------------===// + +class TEX_WORD0 { + field bits<32> Word0; + + bits<5> TEX_INST; + bits<2> INST_MOD; + bits<1> FETCH_WHOLE_QUAD; + bits<8> RESOURCE_ID; + bits<7> SRC_GPR; + bits<1> SRC_REL; + bits<1> ALT_CONST; + bits<2> RESOURCE_INDEX_MODE; + bits<2> SAMPLER_INDEX_MODE; + + let Word0{4-0} = TEX_INST; + let Word0{6-5} = INST_MOD; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = RESOURCE_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{24} = ALT_CONST; + let Word0{26-25} = RESOURCE_INDEX_MODE; + let Word0{28-27} = SAMPLER_INDEX_MODE; +} + +class TEX_WORD1 { + field bits<32> Word1; + + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + 
bits<3> DST_SEL_W; + bits<7> LOD_BIAS; + bits<1> COORD_TYPE_X; + bits<1> COORD_TYPE_Y; + bits<1> COORD_TYPE_Z; + bits<1> COORD_TYPE_W; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{27-21} = LOD_BIAS; + let Word1{28} = COORD_TYPE_X; + let Word1{29} = COORD_TYPE_Y; + let Word1{30} = COORD_TYPE_Z; + let Word1{31} = COORD_TYPE_W; +} + +class TEX_WORD2 { + field bits<32> Word2; + + bits<5> OFFSET_X; + bits<5> OFFSET_Y; + bits<5> OFFSET_Z; + bits<5> SAMPLER_ID; + bits<3> SRC_SEL_X; + bits<3> SRC_SEL_Y; + bits<3> SRC_SEL_Z; + bits<3> SRC_SEL_W; + + let Word2{4-0} = OFFSET_X; + let Word2{9-5} = OFFSET_Y; + let Word2{14-10} = OFFSET_Z; + let Word2{19-15} = SAMPLER_ID; + let Word2{22-20} = SRC_SEL_X; + let Word2{25-23} = SRC_SEL_Y; + let Word2{28-26} = SRC_SEL_Z; + let Word2{31-29} = SRC_SEL_W; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + +class CF_WORD1_R600 { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<3> COUNT; + bits<6> CALL_COUNT; + bits<1> COUNT_3; + bits<1> END_OF_PROGRAM; + bits<1> VALID_PIXEL_MODE; + bits<7> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{12-10} = COUNT; + let Word1{18-13} = CALL_COUNT; + let Word1{19} = COUNT_3; + let Word1{21} = END_OF_PROGRAM; + let Word1{22} = VALID_PIXEL_MODE; + let Word1{29-23} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_WORD0_EG { + field bits<32> Word0; + + bits<24> ADDR; + bits<3> JUMPTABLE_SEL; + + let Word0{23-0} = ADDR; + let Word0{26-24} = JUMPTABLE_SEL; +} + +class CF_WORD1_EG { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<6> COUNT; + bits<1> VALID_PIXEL_MODE; + bits<1> END_OF_PROGRAM; + bits<8> CF_INST; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{15-10} = COUNT; + let Word1{20} = VALID_PIXEL_MODE; + let Word1{21} = END_OF_PROGRAM; + let Word1{29-22} = CF_INST; + let Word1{31} = BARRIER; +} + +class CF_ALU_WORD0 { + field bits<32> Word0; + + bits<22> ADDR; + bits<4> KCACHE_BANK0; + bits<4> KCACHE_BANK1; + bits<2> KCACHE_MODE0; + + let Word0{21-0} = ADDR; + let Word0{25-22} = KCACHE_BANK0; + let Word0{29-26} = KCACHE_BANK1; + let Word0{31-30} = KCACHE_MODE0; +} + +class CF_ALU_WORD1 { + field bits<32> Word1; + + bits<2> KCACHE_MODE1; + bits<8> KCACHE_ADDR0; + bits<8> KCACHE_ADDR1; + bits<7> COUNT; + bits<1> ALT_CONST; + bits<4> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{1-0} = KCACHE_MODE1; + let Word1{9-2} = KCACHE_ADDR0; + let Word1{17-10} = KCACHE_ADDR1; + let Word1{24-18} = COUNT; + let Word1{25} = ALT_CONST; + let Word1{29-26} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_ALLOC_EXPORT_WORD0_RAT { + field bits<32> Word0; + + bits<4> rat_id; + bits<6> rat_inst; + bits<2> rim; + bits<2> type; + bits<7> rw_gpr; + bits<1> rw_rel; + bits<7> index_gpr; + bits<2> elem_size; + + let Word0{3-0} = rat_id; + let Word0{9-4} = rat_inst; + let Word0{10} = 0; // Reserved + let Word0{12-11} = rim; + let Word0{14-13} = type; + let Word0{21-15} = rw_gpr; + let Word0{22} = rw_rel; + let 
Word0{29-23} = index_gpr; + let Word0{31-30} = elem_size; +} + +class CF_ALLOC_EXPORT_WORD1_BUF { + field bits<32> Word1; + + bits<12> array_size; + bits<4> comp_mask; + bits<4> burst_count; + bits<1> vpm; + bits<1> eop; + bits<8> cf_inst; + bits<1> mark; + bits<1> barrier; + + let Word1{11-0} = array_size; + let Word1{15-12} = comp_mask; + let Word1{19-16} = burst_count; + let Word1{20} = vpm; + let Word1{21} = eop; + let Word1{29-22} = cf_inst; + let Word1{30} = mark; + let Word1{31} = barrier; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp new file mode 100644 index 0000000..8b6eea1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -0,0 +1,1430 @@ +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + +#include "R600InstrInfo.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#include "AMDGPUGenDFAPacketizer.inc" + +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { + return RI; +} + +bool R600InstrInfo::isTrig(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; +} + +bool R600InstrInfo::isVector(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; +} + +void +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned VectorComponents = 0; + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + VectorComponents = 4; + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + VectorComponents = 2; + } + + if (VectorComponents > 0) { + for (unsigned I = 0; I < VectorComponents; I++) { + unsigned SubRegIndex = RI.getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + RI.getSubReg(DestReg, SubRegIndex), + RI.getSubReg(SrcReg, SubRegIndex)) + .addReg(DestReg, + RegState::Define | RegState::Implicit); + } + } else { + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + DestReg, SrcReg); + NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + .setIsKill(KillSrc); + } +} + +/// \returns true if \p MBBI can be moved into a new basic. 
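+/// The split is rejected if \p MBBI uses any physical register that is live
+/// across ALU clause boundaries.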
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), + E = MBBI->operands_end(); I != E; ++I) { + if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && + I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + return false; + } + return true; +} + +bool R600InstrInfo::isMov(unsigned Opcode) const { + + + switch(Opcode) { + default: return false; + case AMDGPU::MOV: + case AMDGPU::MOV_IMM_F32: + case AMDGPU::MOV_IMM_I32: + return true; + } +} + +// Some instructions act as place holders to emulate operations that the GPU +// hardware does automatically. This function can be used to check if +// an opcode falls into this category. +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { + switch (Opcode) { + default: return false; + case AMDGPU::RETURN: + return true; + } +} + +bool R600InstrInfo::isReductionOp(unsigned Opcode) const { + return false; +} + +bool R600InstrInfo::isCubeOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::CUBE_r600_pseudo: + case AMDGPU::CUBE_r600_real: + case AMDGPU::CUBE_eg_pseudo: + case AMDGPU::CUBE_eg_real: + return true; + } +} + +bool R600InstrInfo::isALUInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return (TargetFlags & R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::LDS_1A) | + (TargetFlags & R600_InstFlag::LDS_1A1D) | + (TargetFlags & R600_InstFlag::LDS_1A2D)); +} + +bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; +} + +bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; +} + +bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { + if (isALUInstr(MI->getOpcode())) + return true; + if (isVector(*MI) || isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } +} + +bool R600InstrInfo::isTransOnly(unsigned Opcode) const { + if (ST.hasCaymanISA()) + return false; + return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); +} + +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { + return isTransOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { + return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); +} + +bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { + return isVectorOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isExport(unsigned Opcode) const { + return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); +} + +bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { + return ST.hasVertexCache() && IS_VTX(get(Opcode)); +} + +bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const 
R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + return MFI->getShaderType() != ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode()); +} + +bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { + return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); +} + +bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + return (MFI->getShaderType() == ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); +} + +bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::KILLGT: + case AMDGPU::GROUP_BARRIER: + return true; + default: + return false; + } +} + +bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { + if (!isALUInstr(MI->getOpcode())) { + return false; + } + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + if (!I->isReg() || !I->isUse() || + TargetRegisterInfo::isVirtualRegister(I->getReg())) + continue; + + if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + return true; + } + return false; +} + +int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { + static const unsigned OpTable[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + + assert (SrcNum < 3); + return getOperandIdx(Opcode, OpTable[SrcNum]); +} + +int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { + static const unsigned SrcSelTable[][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + }; + + for (const auto &Row : SrcSelTable) { + if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { + return getOperandIdx(Opcode, Row[1]); + } + } + return -1; +} + +SmallVector<std::pair<MachineOperand *, int64_t>, 3> +R600InstrInfo::getSrcs(MachineInstr *MI) const { + SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; + + if (MI->getOpcode() == AMDGPU::DOT_4) { + static const unsigned OpTable[8][2] = { + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + }; + + for (unsigned j = 0; j < 8; j++) { + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + OpTable[j][0])); + unsigned 
Reg = MO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), + OpTable[j][1])).getImm(); + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); + continue; + } + + } + return Result; + } + + static const unsigned OpTable[3][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + }; + + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (SrcIdx < 0) + break; + MachineOperand &MO = MI->getOperand(SrcIdx); + unsigned Reg = MI->getOperand(SrcIdx).getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand( + getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); + continue; + } + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned Imm = MI->getOperand( + getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm)); + continue; + } + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0)); + } + return Result; +} + +std::vector<std::pair<int, unsigned> > +R600InstrInfo::ExtractSrcs(MachineInstr *MI, + const DenseMap<unsigned, unsigned> &PV, + unsigned &ConstCount) const { + ConstCount = 0; + ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); + const std::pair<int, unsigned> DummyPair(-1, 0); + std::vector<std::pair<int, unsigned> > Result; + unsigned i = 0; + for (unsigned n = Srcs.size(); i < n; ++i) { + unsigned Reg = Srcs[i].first->getReg(); + unsigned Index = RI.getEncodingValue(Reg) & 0xff; + if (Reg == AMDGPU::OQAP) { + Result.push_back(std::pair<int, unsigned>(Index, 0)); + } + if (PV.find(Reg) != PV.end()) { + // 255 is used to tells its a PS/PV reg + Result.push_back(std::pair<int, unsigned>(255, 0)); + continue; + } + if (Index > 127) { + ConstCount++; + Result.push_back(DummyPair); + continue; + } + unsigned Chan = RI.getHWRegChan(Reg); + Result.push_back(std::pair<int, unsigned>(Index, Chan)); + } + for (; i < 3; ++i) + Result.push_back(DummyPair); + return Result; +} + +static std::vector<std::pair<int, unsigned> > +Swizzle(std::vector<std::pair<int, unsigned> > Src, + R600InstrInfo::BankSwizzle Swz) { + if (Src[0] == Src[1]) + Src[1].first = -1; + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: + break; + case R600InstrInfo::ALU_VEC_021_SCL_122: + std::swap(Src[1], Src[2]); + break; + case R600InstrInfo::ALU_VEC_102_SCL_221: + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_120_SCL_212: + std::swap(Src[0], Src[1]); + std::swap(Src[0], Src[2]); + break; + case R600InstrInfo::ALU_VEC_201: + std::swap(Src[0], Src[2]); + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_210: + std::swap(Src[0], Src[2]); + break; + } + return Src; +} + +static unsigned +getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: { + unsigned Cycles[3] = { 2, 1, 0}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_021_SCL_122: { + unsigned Cycles[3] = { 1, 2, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_120_SCL_212: { + unsigned Cycles[3] = { 2, 1, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_102_SCL_221: { + unsigned Cycles[3] = { 2, 2, 1}; + return Cycles[Op]; + } + default: + llvm_unreachable("Wrong Swizzle for Trans Slot"); + return 0; + } +} + +/// returns how 
many MIs (whose inputs are represented by IGSrcs) can be packed +/// in the same Instruction Group while meeting read port limitations given a +/// Swz swizzle sequence. +unsigned R600InstrInfo::isLegalUpTo( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<R600InstrInfo::BankSwizzle> &Swz, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + int Vector[4][3]; + memset(Vector, -1, sizeof(Vector)); + for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { + const std::vector<std::pair<int, unsigned> > &Srcs = + Swizzle(IGSrcs[i], Swz[i]); + for (unsigned j = 0; j < 3; j++) { + const std::pair<int, unsigned> &Src = Srcs[j]; + if (Src.first < 0 || Src.first == 255) + continue; + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && + Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { + // The value from output queue A (denoted by register OQAP) can + // only be fetched during the first cycle. + return false; + } + // OQAP does not count towards the normal read port restrictions + continue; + } + if (Vector[Src.second][j] < 0) + Vector[Src.second][j] = Src.first; + if (Vector[Src.second][j] != Src.first) + return i; + } + } + // Now check Trans Alu + for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) { + const std::pair<int, unsigned> &Src = TransSrcs[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (Src.first == 255) + continue; + if (Vector[Src.second][Cycle] < 0) + Vector[Src.second][Cycle] = Src.first; + if (Vector[Src.second][Cycle] != Src.first) + return IGSrcs.size() - 1; + } + return IGSrcs.size(); +} + +/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next +/// (in lexicographic term) swizzle sequence assuming that all swizzles after +/// Idx can be skipped +static bool +NextPossibleSolution( + std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, + unsigned Idx) { + assert(Idx < SwzCandidate.size()); + int ResetIdx = Idx; + while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210) + ResetIdx --; + for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) { + SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210; + } + if (ResetIdx == -1) + return false; + int NextSwizzle = SwzCandidate[ResetIdx] + 1; + SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle; + return true; +} + +/// Enumerate all possible Swizzle sequence to find one that can meet all +/// read port requirements. +bool R600InstrInfo::FindSwizzleForVectorSlot( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + unsigned ValidUpTo = 0; + do { + ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz); + if (ValidUpTo == IGSrcs.size()) + return true; + } while (NextPossibleSolution(SwzCandidate, ValidUpTo)); + return false; +} + +/// Instructions in Trans slot can't read gpr at cycle 0 if they also read +/// a const, and can't read a gpr at cycle 1 if they read 2 const. 
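+/// Concretely: with one constant operand the remaining GPR reads must avoid
+/// cycle 0, and with two constants they must avoid cycles 0 and 1; this is
+/// checked against the candidate trans-slot swizzle's cycle assignment.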
+static bool +isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, + const std::vector<std::pair<int, unsigned> > &TransOps, + unsigned ConstCount) { + // TransALU can't read 3 constants + if (ConstCount > 2) + return false; + for (unsigned i = 0, e = TransOps.size(); i < e; ++i) { + const std::pair<int, unsigned> &Src = TransOps[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (ConstCount > 0 && Cycle == 0) + return false; + if (ConstCount > 1 && Cycle == 1) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, + const DenseMap<unsigned, unsigned> &PV, + std::vector<BankSwizzle> &ValidSwizzle, + bool isLastAluTrans) + const { + //Todo : support shared src0 - src1 operand + + std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs; + ValidSwizzle.clear(); + unsigned ConstCount; + BankSwizzle TransBS = ALU_VEC_012_SCL_210; + for (unsigned i = 0, e = IG.size(); i < e; ++i) { + IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); + unsigned Op = getOperandIdx(IG[i]->getOpcode(), + AMDGPU::OpName::bank_swizzle); + ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) + IG[i]->getOperand(Op).getImm()); + } + std::vector<std::pair<int, unsigned> > TransOps; + if (!isLastAluTrans) + return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); + + TransOps = std::move(IGSrcs.back()); + IGSrcs.pop_back(); + ValidSwizzle.pop_back(); + + static const R600InstrInfo::BankSwizzle TransSwz[] = { + ALU_VEC_012_SCL_210, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221 + }; + for (unsigned i = 0; i < 4; i++) { + TransBS = TransSwz[i]; + if (!isConstCompatible(TransBS, TransOps, ConstCount)) + continue; + bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, + TransBS); + if (Result) { + ValidSwizzle.push_back(TransBS); + return true; + } + } + + return false; +} + + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) + const { + assert (Consts.size() <= 12 && "Too many operands in instructions group"); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + unsigned ReadConstHalf = Consts[i] & 2; + unsigned ReadConstIndex = Consts[i] & (~3); + unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; + if (!Pair1) { + Pair1 = ReadHalfConst; + continue; + } + if (Pair1 == ReadHalfConst) + continue; + if (!Pair2) { + Pair2 = ReadHalfConst; + continue; + } + if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) + const { + std::vector<unsigned> Consts; + SmallSet<int64_t, 4> Literals; + for (unsigned i = 0, n = MIs.size(); i < n; i++) { + MachineInstr *MI = MIs[i]; + if (!isALUInstr(MI->getOpcode())) + continue; + + ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); + + for (unsigned j = 0, e = Srcs.size(); j < e; j++) { + std::pair<MachineOperand *, unsigned> Src = Srcs[j]; + if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + Literals.insert(Src.second); + if (Literals.size() > 4) + return false; + if (Src.first->getReg() == AMDGPU::ALU_CONST) + Consts.push_back(Src.second); + if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || + AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; + unsigned Chan = RI.getHWRegChan(Src.first->getReg()); + Consts.push_back((Index << 2) | Chan); + } + 
} + } + return fitsConstReadLimitations(Consts); +} + +DFAPacketizer * +R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { + const InstrItineraryData *II = STI.getInstrItineraryData(); + return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II); +} + +static bool +isPredicateSetter(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::PRED_X: + return true; + default: + return false; + } +} + +static MachineInstr * +findFirstPredicateSetterFrom(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + while (I != MBB.begin()) { + --I; + MachineInstr *MI = I; + if (isPredicateSetter(MI->getOpcode())) + return MI; + } + + return nullptr; +} + +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + +static bool isBranch(unsigned Opcode) { + return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || + Opcode == AMDGPU::BRANCH_COND_f32; +} + +bool +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // Most of the following comes from the ARM implementation of AnalyzeBranch + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + // AMDGPU::BRANCH* instructions are only available after isel and are not + // handled + if (isBranch(I->getOpcode())) + return true; + if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) { + return false; + } + + // Remove successive JUMP + while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + MachineBasicBlock::iterator PriorI = std::prev(I); + if (AllowModify) + I->removeFromParent(); + I = PriorI; + } + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || + !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) { + if (LastOpc == AMDGPU::JUMP) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastOpc == AMDGPU::JUMP_COND) { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If the block ends with a B and a Bcc, handle it. + if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + MachineInstr *predSet = --I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = SecondLastInst->getOperand(0).getMBB(); + FBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + + // Otherwise, can't handle this. 
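+  // By convention, returning true means the terminator sequence could not be
+  // analyzed.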
+ return true; +} + +static +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); + It != E; ++It) { + if (It->getOpcode() == AMDGPU::CF_ALU || + It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return std::prev(It.base()); + } + return MBB.end(); +} + +unsigned +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) { + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + return 1; + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 1; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 1; + } + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 2; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 2; + } +} + +unsigned +R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + + // Note : we leave PRED* instructions there. + // They may be needed when predicating instructions. + + MachineBasicBlock::iterator I = MBB.end(); + + if (I == MBB.begin()) { + return 0; + } + --I; + switch (I->getOpcode()) { + default: + return 0; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + I = MBB.end(); + + if (I == MBB.begin()) { + return 1; + } + --I; + switch (I->getOpcode()) { + // FIXME: only one case?? 
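+  // Second look: after erasing the last branch, the instruction that now
+  // terminates the block may be the JUMP_COND of a JUMP_COND/JUMP pair (or
+  // another JUMP); erase it as well and report two removed branches.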
+ default: + return 1; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + return 2; +} + +bool +R600InstrInfo::isPredicated(const MachineInstr *MI) const { + int idx = MI->findFirstPredOperandIdx(); + if (idx < 0) + return false; + + unsigned Reg = MI->getOperand(idx).getReg(); + switch (Reg) { + default: return false; + case AMDGPU::PRED_SEL_ONE: + case AMDGPU::PRED_SEL_ZERO: + case AMDGPU::PREDICATE_BIT: + return true; + } +} + +bool +R600InstrInfo::isPredicable(MachineInstr *MI) const { + // XXX: KILL* instructions can be predicated, but they must be the last + // instruction in a clause, so this means any instructions after them cannot + // be predicated. Until we have proper support for instruction clauses in the + // backend, we will mark KILL* instructions as unpredicable. + + if (MI->getOpcode() == AMDGPU::KILLGT) { + return false; + } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + // If the clause start in the middle of MBB then the MBB has more + // than a single clause, unable to predicate several clauses. + if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + return false; + // TODO: We don't support KC merging atm + if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) + return false; + return true; + } else if (isVector(*MI)) { + return false; + } else { + return AMDGPUInstrInfo::isPredicable(MI); + } +} + + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + BranchProbability Probability) const{ + return true; +} + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + BranchProbability Probability) const { + return true; +} + +bool +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + BranchProbability Probability) + const { + return true; +} + +bool +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const { + return false; +} + + +bool +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + MachineOperand &MO = Cond[1]; + switch (MO.getImm()) { + case OPCODE_IS_ZERO_INT: + MO.setImm(OPCODE_IS_NOT_ZERO_INT); + break; + case OPCODE_IS_NOT_ZERO_INT: + MO.setImm(OPCODE_IS_ZERO_INT); + break; + case OPCODE_IS_ZERO: + MO.setImm(OPCODE_IS_NOT_ZERO); + break; + case OPCODE_IS_NOT_ZERO: + MO.setImm(OPCODE_IS_ZERO); + break; + default: + return true; + } + + MachineOperand &MO2 = Cond[2]; + switch (MO2.getReg()) { + case AMDGPU::PRED_SEL_ZERO: + MO2.setReg(AMDGPU::PRED_SEL_ONE); + break; + case AMDGPU::PRED_SEL_ONE: + MO2.setReg(AMDGPU::PRED_SEL_ZERO); + break; + default: + return true; + } + return false; +} + +bool +R600InstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + return isPredicateSetter(MI->getOpcode()); +} + + +bool +R600InstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { + return false; +} + + +bool +R600InstrInfo::PredicateInstruction(MachineInstr *MI, + 
ArrayRef<MachineOperand> Pred) const { + int PIdx = MI->findFirstPredOperandIdx(); + + if (MI->getOpcode() == AMDGPU::CF_ALU) { + MI->getOperand(8).setImm(0); + return true; + } + + if (MI->getOpcode() == AMDGPU::DOT_4) { + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) + .setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + return false; +} + +unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { + return 2; +} + +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (PredCost) + *PredCost = 2; + return 2; +} + +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + +void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + MF.getSubtarget().getFrameLowering()); + + unsigned StackWidth = TFL->getStackWidth(MF); + int End = getIndirectIndexEnd(MF); + + if (End == -1) + return; + + for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); + Reserved.set(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Reserved.set(Reg); + } + } +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::R600_TReg32_XRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + 
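+  // An indirect write expands to two instructions: MOVA_INT_eg loads the
+  // offset into AR_X, then a MOV with dst_rel set stores ValueReg through
+  // the address register chosen for the requested channel.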
unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); + + return Mov; +} + +unsigned R600InstrInfo::getMaxAlusPerClause() const { + return 115; +} + +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg) const { + MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), + DstReg); // $dst + + if (Src1Reg) { + MIB.addImm(0) // $update_exec_mask + .addImm(0); // $update_predicate + } + MIB.addImm(1) // $write + .addImm(0) // $omod + .addImm(0) // $dst_rel + .addImm(0) // $dst_clamp + .addReg(Src0Reg) // $src0 + .addImm(0) // $src0_neg + .addImm(0) // $src0_rel + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel + + if (Src1Reg) { + MIB.addReg(Src1Reg) // $src1 + .addImm(0) // $src1_neg + .addImm(0) // $src1_rel + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel + } + + //XXX: The r600g finalizer expects this to be 1, once we've moved the + //scheduling to the backend, we can change the default to 0. 
+ MIB.addImm(1) // $last + .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addImm(0) // $literal + .addImm(0); // $bank_swizzle + + return MIB; +} + +#define OPERAND_CASE(Label) \ + case Label: { \ + static const unsigned Ops[] = \ + { \ + Label##_X, \ + Label##_Y, \ + Label##_Z, \ + Label##_W \ + }; \ + return Ops[Slot]; \ + } + +static unsigned getSlotedOps(unsigned Op, unsigned Slot) { + switch (Op) { + OPERAND_CASE(AMDGPU::OpName::update_exec_mask) + OPERAND_CASE(AMDGPU::OpName::update_pred) + OPERAND_CASE(AMDGPU::OpName::write) + OPERAND_CASE(AMDGPU::OpName::omod) + OPERAND_CASE(AMDGPU::OpName::dst_rel) + OPERAND_CASE(AMDGPU::OpName::clamp) + OPERAND_CASE(AMDGPU::OpName::src0) + OPERAND_CASE(AMDGPU::OpName::src0_neg) + OPERAND_CASE(AMDGPU::OpName::src0_rel) + OPERAND_CASE(AMDGPU::OpName::src0_abs) + OPERAND_CASE(AMDGPU::OpName::src0_sel) + OPERAND_CASE(AMDGPU::OpName::src1) + OPERAND_CASE(AMDGPU::OpName::src1_neg) + OPERAND_CASE(AMDGPU::OpName::src1_rel) + OPERAND_CASE(AMDGPU::OpName::src1_abs) + OPERAND_CASE(AMDGPU::OpName::src1_sel) + OPERAND_CASE(AMDGPU::OpName::pred_sel) + default: + llvm_unreachable("Wrong Operand"); + } +} + +#undef OPERAND_CASE + +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( + MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + const { + assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + unsigned Opcode; + if (ST.getGeneration() <= AMDGPUSubtarget::R700) + Opcode = AMDGPU::DOT4_r600; + else + Opcode = AMDGPU::DOT4_eg; + MachineBasicBlock::iterator I = MI; + MachineOperand &Src0 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + MachineOperand &Src1 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + MachineInstr *MIB = buildDefaultInstruction( + MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + static const unsigned Operands[14] = { + AMDGPU::OpName::update_exec_mask, + AMDGPU::OpName::update_pred, + AMDGPU::OpName::write, + AMDGPU::OpName::omod, + AMDGPU::OpName::dst_rel, + AMDGPU::OpName::clamp, + AMDGPU::OpName::src0_neg, + AMDGPU::OpName::src0_rel, + AMDGPU::OpName::src0_abs, + AMDGPU::OpName::src0_sel, + AMDGPU::OpName::src1_neg, + AMDGPU::OpName::src1_rel, + AMDGPU::OpName::src1_abs, + AMDGPU::OpName::src1_sel, + }; + + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + .setReg(MO.getReg()); + + for (unsigned i = 0; i < 14; i++) { + MachineOperand &MO = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + assert (MO.isImm()); + setImmOperand(MIB, Operands[i], MO.getImm()); + } + MIB->getOperand(20).setImm(0); + return MIB; +} + +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const { + MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, + AMDGPU::ALU_LITERAL_X); + setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); + return MovImm; +} + +MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const { + return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); +} + +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { + return getOperandIdx(MI.getOpcode(), Op); +} + +int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const { + return AMDGPU::getNamedOperandIdx(Opcode, Op); +} + +void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, + int64_t Imm) const { + int Idx = getOperandIdx(*MI, Op); + assert(Idx != -1 && "Operand not supported for this instruction."); + assert(MI->getOperand(Idx).isImm()); + MI->getOperand(Idx).setImm(Imm); +} + +//===----------------------------------------------------------------------===// +// Instruction flag getters/setters +//===----------------------------------------------------------------------===// + +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { + return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; +} + +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + int FlagIndex = 0; + if (Flag != 0) { + // If we pass something other than the default value of Flag to this + // function, it means we are want to set a flag on an instruction + // that uses native encoding. + assert(HAS_NATIVE_OPERANDS(TargetFlags)); + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; + switch (Flag) { + case MO_FLAG_CLAMP: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); + break; + case MO_FLAG_MASK: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); + break; + case MO_FLAG_NOT_LAST: + case MO_FLAG_LAST: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); + break; + case MO_FLAG_NEG: + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; + case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; + } + break; + + case MO_FLAG_ABS: + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " + "instructions."); + (void)IsOP3; + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; + } + break; + + default: + FlagIndex = -1; + break; + } + assert(FlagIndex != -1 && "Flag not supported for this instruction"); + } else { + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); + assert(FlagIndex != 0 && + "Instruction flags not supported for this instruction"); + } + + MachineOperand &FlagOp = MI->getOperand(FlagIndex); + assert(FlagOp.isImm()); + return FlagOp; +} + +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (Flag == 0) { + return; + } + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + if (Flag == MO_FLAG_NOT_LAST) { + clearFlag(MI, Operand, MO_FLAG_LAST); + } else if (Flag == MO_FLAG_MASK) { + clearFlag(MI, Operand, Flag); + } else { + FlagOp.setImm(1); + } + } else { + MachineOperand &FlagOp = getFlagOp(MI, Operand); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + } +} + +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + FlagOp.setImm(0); + } else { + MachineOperand &FlagOp = getFlagOp(MI); + unsigned InstFlags = FlagOp.getImm(); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + FlagOp.setImm(InstFlags); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h 
b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h new file mode 100644 index 0000000..e7251c3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -0,0 +1,303 @@ +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600InstrInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "R600Defines.h" +#include "R600RegisterInfo.h" +#include <map> + +namespace llvm { + + class AMDGPUTargetMachine; + class DFAPacketizer; + class ScheduleDAG; + class MachineFunction; + class MachineInstr; + class MachineInstrBuilder; + + class R600InstrInfo : public AMDGPUInstrInfo { + private: + const R600RegisterInfo RI; + + std::vector<std::pair<int, unsigned> > + ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const; + + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + public: + enum BankSwizzle { + ALU_VEC_012_SCL_210 = 0, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221, + ALU_VEC_201, + ALU_VEC_210 + }; + + explicit R600InstrInfo(const AMDGPUSubtarget &st); + + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; + + bool isTrig(const MachineInstr &MI) const; + bool isPlaceHolderOpcode(unsigned opcode) const; + bool isReductionOp(unsigned opcode) const; + bool isCubeOp(unsigned opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction. + bool isALUInstr(unsigned Opcode) const; + bool hasInstrModifiers(unsigned Opcode) const; + bool isLDSInstr(unsigned Opcode) const; + bool isLDSNoRetInstr(unsigned Opcode) const; + bool isLDSRetInstr(unsigned Opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction or an + /// instruction that will be lowered in ExpandSpecialInstrs Pass. 
+ bool canBeConsideredALU(const MachineInstr *MI) const; + + bool isTransOnly(unsigned Opcode) const; + bool isTransOnly(const MachineInstr *MI) const; + bool isVectorOnly(unsigned Opcode) const; + bool isVectorOnly(const MachineInstr *MI) const; + bool isExport(unsigned Opcode) const; + + bool usesVertexCache(unsigned Opcode) const; + bool usesVertexCache(const MachineInstr *MI) const; + bool usesTextureCache(unsigned Opcode) const; + bool usesTextureCache(const MachineInstr *MI) const; + + bool mustBeLastInClause(unsigned Opcode) const; + bool usesAddressRegister(MachineInstr *MI) const; + bool definesAddressRegister(MachineInstr *MI) const; + bool readsLDSSrcReg(const MachineInstr *MI) const; + + /// \returns The operand index for the given source number. Legal values + /// for SrcNum are 0, 1, and 2. + int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; + /// \returns The operand index for the Sel operand, given an index to one + /// of the instruction's src operands. + int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; + + /// \returns a pair for each src of an ALU instruction. + /// The first member of a pair is the register id. + /// If the register is ALU_CONST, the second member is SEL. + /// If the register is ALU_LITERAL, the second member is IMM. + /// Otherwise, the second member's value is undefined. + SmallVector<std::pair<MachineOperand *, int64_t>, 3> + getSrcs(MachineInstr *MI) const; + + unsigned isLegalUpTo( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<R600InstrInfo::BankSwizzle> &Swz, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + bool FindSwizzleForVectorSlot( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210, + /// returns true and stores in BS the first (in lexical order) BankSwizzle + /// assignment, starting from the one already provided in the instruction + /// group MIs, that fits the read-port limitations, if one is available. + /// Otherwise returns false and leaves the contents of BS undefined. + /// isLastAluTrans should be set if the last ALU of MIs will be executed on + /// the Trans ALU. In this case, the BankSwizzle value to apply to the last + /// instruction is also returned in BS. + /// PV maps GPRs to PV registers in the instruction group MIs. + bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs, + const DenseMap<unsigned, unsigned> &PV, + std::vector<BankSwizzle> &BS, + bool isLastAluTrans) const; + + /// An instruction group can only access a 2-channel pair (either [XY] or [ZW]) + /// from a KCache bank on R700+. This function checks whether the given set + /// of MIs meets this limitation. + bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const; + /// Same check, but using a set of constant indices instead of a set of MIs. + bool fitsConstReadLimitations(const std::vector<unsigned>&) const; + + /// \brief Vector instructions are instructions that must fill all + /// instruction slots within an instruction group. 
+ bool isVector(const MachineInstr &MI) const; + + bool isMov(unsigned Opcode) const override; + + DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const override; + + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + bool isPredicated(const MachineInstr *MI) const override; + + bool isPredicable(MachineInstr *MI) const override; + + bool + isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + BranchProbability Probability) const override; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + unsigned ExtraPredCycles, + BranchProbability Probability) const override ; + + bool + isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + BranchProbability Probability) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override; + + bool PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Pred) const override; + + unsigned int getPredicationCost(const MachineInstr *) const override; + + unsigned int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + /// \brief Reserve the registers that may be accesed using indirect addressing. + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + unsigned getMaxAlusPerClause() const; + + ///buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. + /// You can use this function to avoid manually specifying each instruction + /// modifier operand when building a new instruction. + /// + /// \returns a MachineInstr with all the instruction modifiers initialized + /// to their default values. 
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg = 0) const; + + MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned Slot, + unsigned DstReg) const; + + MachineInstr *buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + + /// \brief Get the index of Op in the MachineInstr. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(const MachineInstr &MI, unsigned Op) const; + + /// \brief Get the index of \p Op for the given Opcode. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(unsigned Opcode, unsigned Op) const; + + /// \brief Helper function for setting instruction flag values. + void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; + + /// \returns true if this instruction has an operand for storing target flags. + bool hasFlagOperand(const MachineInstr &MI) const; + + ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + + ///\brief Determine if the specified \p Flag is set on this \p Operand. + bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) + /// \param Flag The flag being set. + /// + /// \returns the operand containing the flags for this instruction. + MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + unsigned Flag = 0) const; + + /// \brief Clear the specified flag on the instruction. + void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; +}; + +namespace AMDGPU { + +int getLDSNoRetOp(uint16_t Opcode); + +} //End namespace AMDGPU + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td new file mode 100644 index 0000000..33ef6a4 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -0,0 +1,1744 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available on R600 family +// GPUs. 
+// +//===----------------------------------------------------------------------===// + +include "R600Intrinsics.td" +include "R600InstrFormats.td" + +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : + InstR600 <outs, ins, asm, pattern, NullALU> { + + let Namespace = "AMDGPU"; +} + +def MEMxi : Operand<iPTR> { + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); + let PrintMethod = "printMemOperand"; +} + +def MEMrr : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); +} + +// Operands for non-registers + +class InstFlag<string PM = "printOperand", int Default = 0> + : OperandWithDefaultOps <i32, (ops (i32 Default))> { + let PrintMethod = PM; +} + +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers +def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> { + let PrintMethod = "printSel"; +} +def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printBankSwizzle"; +} + +def LITERAL : InstFlag<"printLiteral">; + +def WRITE : InstFlag <"printWrite", 1>; +def OMOD : InstFlag <"printOMOD">; +def REL : InstFlag <"printRel">; +def CLAMP : InstFlag <"printClamp">; +def NEG : InstFlag <"printNeg">; +def ABS : InstFlag <"printAbs">; +def UEM : InstFlag <"printUpdateExecMask">; +def UP : InstFlag <"printUpdatePred">; + +// XXX: The r600g finalizer in Mesa expects last to be one in most cases. +// Once we start using the packetizer in this backend we should have this +// default to 0. +def LAST : InstFlag<"printLast", 1>; +def RSel : Operand<i32> { + let PrintMethod = "printRSel"; +} +def CT: Operand<i32> { + let PrintMethod = "printCT"; +} + +def FRAMEri : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + +def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>; +def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; +def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; +def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>; +def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>; + + +def R600_Pred : PredicateOperand<i32, (ops R600_Predicate), + (ops PRED_SEL_OFF)>; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +// Class for instructions with only one source register. +// If you add new ins to this instruction, make sure they are listed before +// $literal, because the backend currently assumes that the last operand is +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), +// and R600InstrInfo::getOperandIdx(). 
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 <inst> { + + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let update_exec_mask = 0; + let update_pred = 0; + let HasNativeOperands = 1; + let Op1 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_1OP <inst, opName, + [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin +>; + +// If you add or change the operands for R600_2OP instructions, you must +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). +class R600_2OP <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, + OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 <inst> { + + let HasNativeOperands = 1; + let Op2 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_2OP <inst, opName, + [(set R600_Reg32:$dst, (node R600_Reg32:$src0, + R600_Reg32:$src1))], itin +>; + +// If you add our change the operands for R600_3OP instructions, you must +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and +// R600InstrInfo::getOperandIdx(). 
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " + "$src0_neg$src0$src0_rel, " + "$src1_neg$src1$src1_rel, " + "$src2_neg$src2$src2_rel, " + "$pred_sel" + "$bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP3<inst>{ + + let HasNativeOperands = 1; + let DisableEncoding = "$literal"; + let Op3 = 1; + let UseNamedOperandTable = 1; + let ALUInst = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, + InstrItinClass itin = VecALU> : + InstR600 <(outs R600_Reg32:$dst), + ins, + asm, + pattern, + itin>; + + + +} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; + +def TEX_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 14; + }] +>; + +def TEX_ARRAY_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 15; + }] +>; + +class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, + dag outs, dag ins, string asm, list<dag> pattern> : + InstR600ISA <outs, ins, asm, pattern>, + CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { + + let rat_id = ratid; + let rat_inst = ratinst; + let rim = 0; + // XXX: Have a separate instruction for non-indexed writes. + let type = 1; + let rw_rel = 0; + let elem_size = 0; + + let array_size = 0; + let comp_mask = mask; + let burst_count = 0; + let vpm = 0; + let cf_inst = cfinst; + let mark = 0; + let barrier = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; + +} + +class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>, + VTX_WORD1_GPR { + + // Static fields + let DST_REL = 0; + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, + // however, based on my testing if USE_CONST_FIELDS is set, then all + // these fields need to be set to 0. 
+ let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 1; + let FORMAT_COMP_ALL = 0; + let SRF_MODE_ALL = 0; + + let Inst{63-32} = Word1; + // LLVM can only encode 64-bit instructions, so these fields are manually + // encoded in R600CodeEmitter + // + // bits<16> OFFSET; + // bits<2> ENDIAN_SWAP = 0; + // bits<1> CONST_BUF_NO_STRIDE = 0; + // bits<1> MEGA_FETCH = 0; + // bits<1> ALT_CONST = 0; + // bits<2> BUFFER_INDEX_MODE = 0; + + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding + // is done in R600CodeEmitter + // + // Inst{79-64} = OFFSET; + // Inst{81-80} = ENDIAN_SWAP; + // Inst{82} = CONST_BUF_NO_STRIDE; + // Inst{83} = MEGA_FETCH; + // Inst{84} = ALT_CONST; + // Inst{86-85} = BUFFER_INDEX_MODE; + // Inst{95-86} = 0; Reserved + + // VTX_WORD3 (Padding) + // + // Inst{127-96} = 0; + + let VTXInst = 1; +} + +class LoadParamFrag <PatFrag load_type> : PatFrag < + (ops node:$ptr), (load_type node:$ptr), + [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }] +>; + +def load_param : LoadParamFrag<load>; +def load_param_exti8 : LoadParamFrag<az_extloadi8>; +def load_param_exti16 : LoadParamFrag<az_extloadi16>; + +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; + +def isR600toCayman + : Predicate< + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; + +//===----------------------------------------------------------------------===// +// R600 SDNodes +//===----------------------------------------------------------------------===// + +def INTERP_PAIR_XY : AMDGPUShaderInst < + (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", + []>; + +def INTERP_PAIR_ZW : AMDGPUShaderInst < + (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", + []>; + +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", + SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPVariadic] +>; + +def DOT4 : SDNode<"AMDGPUISD::DOT4", + SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, + SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, + SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, + [] +>; + +def COS_HW : SDNode<"AMDGPUISD::COS_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; + +def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; + +multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> { +def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, + (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), + (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), + (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), + (i32 imm:$DST_SEL_W), + (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), + (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), + (i32 imm:$COORD_TYPE_W)), + (inst R600_Reg128:$SRC_GPR, + imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, + imm:$offsetx, imm:$offsety, imm:$offsetz, + imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, + imm:$DST_SEL_W, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, + imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, + imm:$COORD_TYPE_W)>; +} + 
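+// A brief usage sketch (the names below are taken from the instantiations
+// later in this file): each TexPattern instantiation maps a TEXTURE_FETCH
+// node carrying the given TextureOp immediate onto the corresponding
+// R600_TEX instruction, with vt defaulting to v4f32 and overridden for
+// integer-result fetches, e.g.:
+//   defm : TexPattern<0, TEX_SAMPLE>;
+//   defm : TexPattern<6, TEX_LD, v4i32>;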
+//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// + +def INTERP_VEC_LOAD : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins i32imm:$src0), + "INTERP_LOAD $src0 : $dst", + [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; + +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { + let bank_swizzle = 5; +} + +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { + let bank_swizzle = 5; +} + +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; + +//===----------------------------------------------------------------------===// +// Export Instructions +//===----------------------------------------------------------------------===// + +def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, + [SDNPHasChain, SDNPSideEffect]>; + +class ExportWord0 { + field bits<32> Word0; + + bits<13> arraybase; + bits<2> type; + bits<7> gpr; + bits<2> elem_size; + + let Word0{12-0} = arraybase; + let Word0{14-13} = type; + let Word0{21-15} = gpr; + let Word0{22} = 0; // RW_REL + let Word0{29-23} = 0; // INDEX_GPR + let Word0{31-30} = elem_size; +} + +class ExportSwzWord1 { + field bits<32> Word1; + + bits<3> sw_x; + bits<3> sw_y; + bits<3> sw_z; + bits<3> sw_w; + bits<1> eop; + bits<8> inst; + + let Word1{2-0} = sw_x; + let Word1{5-3} = sw_y; + let Word1{8-6} = sw_z; + let Word1{11-9} = sw_w; +} + +class ExportBufWord1 { + field bits<32> Word1; + + bits<12> arraySize; + bits<4> compMask; + bits<1> eop; + bits<8> inst; + + let Word1{11-0} = arraySize; + let Word1{15-12} = compMask; +} + +multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { + def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 0, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 7, 0, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy (i32 imm:$type)), + (ExportInst + (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy 1), + (ExportInst + (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$base, + imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) + >; + +} + +multiclass SteamOutputExportPattern<Instruction ExportInst, + bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { +// Stream0 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf0inst, 0)>; +// Stream1 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf1inst, 0)>; +// Stream2 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf2inst, 0)>; +// Stream3 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), + 
(ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf3inst, 0)>; +} + +// Export Instructions should not be duplicated by TailDuplication pass +// (which assumes that duplicable instruction are affected by exec mask) +let usesCustomInserter = 1, isNotDuplicable = 1 in { + +class ExportSwzInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst, + i32imm:$eop), + !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"), + []>, ExportWord0, ExportSwzWord1 { + let elem_size = 3; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +} // End usesCustomInserter = 1 + +class ExportBufInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportBufWord1 { + let elem_size = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + + +def KCACHE : InstFlag<"printKCache">; + +class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs), +(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, +KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, +i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, +i32imm:$COUNT, i32imm:$Enabled), +!strconcat(OpName, " $COUNT, @$ADDR, " +"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), +[] >, CF_ALU_WORD0, CF_ALU_WORD1 { + field bits<64> Inst; + + let CF_INST = inst; + let ALT_CONST = 0; + let WHOLE_QUAD_MODE = 0; + let BARRIER = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_WORD0_R600 { + field bits<32> Word0; + + bits<32> ADDR; + + let Word0 = ADDR; +} + +class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { + field bits<64> Inst; + bits<4> CNT; + + let CF_INST = inst; + let BARRIER = 1; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let COUNT = CNT{2-0}; + let CALL_COUNT = 0; + let COUNT_3 = CNT{3}; + let END_OF_PROGRAM = 0; + let WHOLE_QUAD_MODE = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { + field bits<64> Inst; + + let CF_INST = inst; + let BARRIER = 1; + let JUMPTABLE_SEL = 0; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let END_OF_PROGRAM = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +def CF_ALU : ALU_CLAUSE<8, "ALU">; +def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; +def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; +def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; +def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; +def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">; + +def FETCH_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def ALU_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def LITERALS : AMDGPUInst <(outs), +(ins 
LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + let isCodeGenOnly = 1; + + field bits<64> Inst; + bits<32> literal1; + bits<32> literal2; + + let Inst{31-0} = literal1; + let Inst{63-32} = literal2; +} + +def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { + field bits<64> Inst; +} + +let Predicates = [isR600toCayman] in { + +//===----------------------------------------------------------------------===// +// Common Instructions R600, R700, Evergreen, Cayman +//===----------------------------------------------------------------------===// + +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; +// Non-IEEE MUL: 0 * anything = 0 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; +// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx +// DX10 min/max returns the other operand if one is NaN, +// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic +def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; +def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; + +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, +// so some of the instruction names don't match the asm string. +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. +def SETE : R600_2OP < + 0x08, "SETE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] +>; + +def SGT : R600_2OP < + 0x09, "SETGT", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] +>; + +def SGE : R600_2OP < + 0xA, "SETGE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] +>; + +def SNE : R600_2OP < + 0xB, "SETNE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] +>; + +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] +>; + +// FIXME: This should probably be COND_ONE +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] +>; + +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; + +def MOV : R600_1OP <0x19, "MOV", []>; + +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { + +class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < + (outs R600_Reg32:$dst), + (ins immType:$imm), + "", + [] +>; + +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 + +def MOV_IMM_I32 : MOV_IMM<i32, i32imm>; +def : Pat < + (imm:$val), + (MOV_IMM_I32 imm:$val) +>; + +def MOV_IMM_F32 : MOV_IMM<f32, f32imm>; +def : Pat < + (fpimm:$val), + (MOV_IMM_F32 fpimm:$val) +>; + +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; +def PRED_SETNE : R600_2OP <0x23, 
"PRED_SETNE", []>; + +let hasSideEffects = 1 in { + +def KILLGT : R600_2OP <0x2D, "KILLGT", []>; + +} // end hasSideEffects + +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; + +def SETE_INT : R600_2OP < + 0x3A, "SETE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] +>; + +def SETGT_INT : R600_2OP < + 0x3B, "SETGT_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] +>; + +def SETGE_INT : R600_2OP < + 0x3C, "SETGE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] +>; + +def SETNE_INT : R600_2OP < + 0x3D, "SETNE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] +>; + +def SETGT_UINT : R600_2OP < + 0x3E, "SETGT_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] +>; + +def SETGE_UINT : R600_2OP < + 0x3F, "SETGE_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] +>; + +def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>; +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; +def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; + +def CNDE_INT : R600_3OP < + 0x1C, "CNDE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] +>; + +def CNDGE_INT : R600_3OP < + 0x1E, "CNDGE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))] +>; + +def CNDGT_INT : R600_3OP < + 0x1D, "CNDGT_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))] +>; + +//===----------------------------------------------------------------------===// +// Texture instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +class R600_TEX <bits<11> inst, string opName> : + InstR600 <(outs R600_Reg128:$DST_GPR), + (ins R600_Reg128:$SRC_GPR, + RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw, + i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz, + RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W, + i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, + CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, + CT:$COORD_TYPE_W), + !strconcat(opName, + " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " + "$SRC_GPR.$srcx$srcy$srcz$srcw " + "RID:$RESOURCE_ID SID:$SAMPLER_ID " + "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"), + [], + NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let TEX_INST = inst{4-0}; + let SRC_REL = 0; + let DST_REL = 0; + let LOD_BIAS = 0; + + let INST_MOD = 0; + let FETCH_WHOLE_QUAD = 0; + let ALT_CONST = 0; + let SAMPLER_INDEX_MODE = 0; + let RESOURCE_INDEX_MODE = 0; + + let TEXInst = 1; +} + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + + + +def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">; +def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">; +def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">; +def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">; +def 
TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">; +def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">; +def TEX_LD : R600_TEX <0x03, "TEX_LD">; +def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> { + let INST_MOD = 1; +} +def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">; +def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">; +def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">; +def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">; +def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">; +def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">; +def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">; + +defm : TexPattern<0, TEX_SAMPLE>; +defm : TexPattern<1, TEX_SAMPLE_C>; +defm : TexPattern<2, TEX_SAMPLE_L>; +defm : TexPattern<3, TEX_SAMPLE_C_L>; +defm : TexPattern<4, TEX_SAMPLE_LB>; +defm : TexPattern<5, TEX_SAMPLE_C_LB>; +defm : TexPattern<6, TEX_LD, v4i32>; +defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>; +defm : TexPattern<8, TEX_GET_GRADIENTS_H>; +defm : TexPattern<9, TEX_GET_GRADIENTS_V>; +defm : TexPattern<10, TEX_LDPTR, v4i32>; + +//===----------------------------------------------------------------------===// +// Helper classes for common instructions +//===----------------------------------------------------------------------===// + +class MUL_LIT_Common <bits<5> inst> : R600_3OP < + inst, "MUL_LIT", + [] +>; + +class MULADD_Common <bits<5> inst> : R600_3OP < + inst, "MULADD", + [] +>; + +class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < + inst, "MULADD_IEEE", + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] +>; + +class FMA_Common <bits<5> inst> : R600_3OP < + inst, "FMA", + [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU +>; + +class CNDE_Common <bits<5> inst> : R600_3OP < + inst, "CNDE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] +>; + +class CNDGT_Common <bits<5> inst> : R600_3OP < + inst, "CNDGT", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] +> { + let Itinerary = VecALU; +} + +class CNDGE_Common <bits<5> inst> : R600_3OP < + inst, "CNDGE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] +> { + let Itinerary = VecALU; +} + + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins +// Slot X + UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, + OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, + R600_Pred:$pred_sel_X, +// Slot Y + UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, + OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, + OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, + OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + 
R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU> { + + let UseNamedOperandTable = 1; + +} +} + +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 + R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, + R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, + R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, + R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; + + +class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { +multiclass CUBE_Common <bits<11> inst> { + + def _pseudo : InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0), + "CUBE $dst $src0", + [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], + VecALU + > { + let isPseudo = 1; + let UseNamedOperandTable = 1; + } + + def _real : R600_2OP <inst, "CUBE", []>; +} +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "EXP_IEEE", fexp2 +> { + let Itinerary = TransALU; +} + +class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_INT", fp_to_sint +> { + let Itinerary = TransALU; +} + +class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "INT_TO_FLT", sint_to_fp +> { + let Itinerary = TransALU; +} + +class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_UINT", fp_to_uint +> { + let Itinerary = TransALU; +} + +class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "UINT_TO_FLT", uint_to_fp +> { + let Itinerary = TransALU; +} + +class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "LOG_CLAMPED", [] +>; + +class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "LOG_IEEE", flog2 +> { + let Itinerary = TransALU; +} + +class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>; +class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; +class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; +class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI_INT", mulhs +> { + let Itinerary = TransALU; +} +class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI", mulhu +> { + let Itinerary = TransALU; +} +class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULLO_INT", mul +> { + let Itinerary = TransALU; +} +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> { + let Itinerary = TransALU; +} + +class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_CLAMPED", [] +> { + let Itinerary = TransALU; +} + +class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] +> { + let Itinerary = TransALU; +} + +class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIP_UINT", AMDGPUurecip +> { + let Itinerary = TransALU; +} + +// Clamped to maximum. +class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped +> { + let Itinerary = TransALU; +} + +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy +> { + let Itinerary = TransALU; +} + +// TODO: There is also RECIPSQRT_FF which clamps to zero. 
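+// A brief usage sketch: the *_Common helpers above are parameterized only by
+// the hardware opcode, so the per-generation definitions just supply the
+// encoding for that generation, e.g. (from the isR600 block below):
+//   def RECIP_IEEE_r600     : RECIP_IEEE_Common<0x66>;
+//   def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+// Later generations presumably reuse the same classes with their own opcodes
+// in their respective .td files.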
+ +class SIN_Common <bits<11> inst> : R600_1OP < + inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ + let Trig = 1; + let Itinerary = TransALU; +} + +class COS_Common <bits<11> inst> : R600_1OP < + inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> { + let Trig = 1; + let Itinerary = TransALU; +} + +def CLAMP_R600 : CLAMP <R600_Reg32>; +def FABS_R600 : FABS<R600_Reg32>; +def FNEG_R600 : FNEG<R600_Reg32>; + +//===----------------------------------------------------------------------===// +// Helper patterns for complex intrinsics +//===----------------------------------------------------------------------===// + +// FIXME: Should be predicated on unsafe fp math. +multiclass DIV_Common <InstR600 recip_ieee> { +def : Pat< + (int_AMDGPU_div f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : Pat< + (fdiv f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : RcpPat<recip_ieee, f32>; +} + +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> + : Pat < + (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) +>; + +//===----------------------------------------------------------------------===// +// R600 / R700 Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR600] in { + + def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; + def MULADD_r600 : MULADD_Common<0x10>; + def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; + def CNDE_r600 : CNDE_Common<0x18>; + def CNDGT_r600 : CNDGT_Common<0x19>; + def CNDGE_r600 : CNDGE_Common<0x1A>; + def DOT4_r600 : DOT4_Common<0x50>; + defm CUBE_r600 : CUBE_Common<0x52>; + def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; + def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; + def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; + def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; + def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; + def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; + def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; + def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; + def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; + def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; + def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; + def SIN_r600 : SIN_Common<0x6E>; + def COS_r600 : COS_Common<0x6F>; + def ASHR_r600 : ASHR_Common<0x70>; + def LSHR_r600 : LSHR_Common<0x71>; + def LSHL_r600 : LSHL_Common<0x72>; + def MULLO_INT_r600 : MULLO_INT_Common<0x73>; + def MULHI_INT_r600 : MULHI_INT_Common<0x74>; + def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; + def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; + def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; + + defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; + def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; + def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; + + def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; + def : RsqPat<RECIPSQRT_IEEE_r600, f32>; + + def R600_ExportSwz : ExportSwzInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern<R600_ExportSwz, 39>; + + def R600_ExportBuf : ExportBufInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 
0x22, 0x23>; + + def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT), + "TEX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT), + "VTX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), + "PUSH_ELSE @$ADDR"> { + let CNT = 0; + let POP_COUNT = 0; // FIXME? + } + def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { + let ADDR = 0; + let CNT = 0; + let POP_COUNT = 0; + } + def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { + let CNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + +} + + +//===----------------------------------------------------------------------===// +// Regist loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; + + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +def PRED_X : InstR600 < + (outs R600_Predicate_Bit:$dst), + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; +} + +let isTerminator = 1, isBranch = 1 in { +def JUMP_COND : InstR600 < + (outs), + (ins brtarget:$target, R600_Predicate_Bit:$p), + "JUMP $target ($p)", + [], AnyALU + >; + +def JUMP : InstR600 < + (outs), + (ins brtarget:$target), + "JUMP $target", + [], AnyALU + > +{ + let isPredicable = 1; + let isBarrier = 1; +} + +} // End isTerminator = 1, isBranch = 1 + +let usesCustomInserter = 1 in { + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { + +def MASK_WRITE : AMDGPUShaderInst < + (outs), + (ins R600_Reg32:$src), + "MASK_WRITE $src", + [] +>; + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 + + +def TXD: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + NullALU > { + let TEXInst = 1; +} + +def TXD_SHADOW: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set 
v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], + NullALU +> { + let TEXInst = 1; +} +} // End isPseudo = 1 +} // End usesCustomInserter = 1 + + +//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let hasSideEffects = 0; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + VTX_WORD1_GPR, VTX_WORD0_eg { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +def TEX_VTX_TEXBUF: + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", + [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, +VTX_WORD1_GPR, VTX_WORD0_eg { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X = 0; +let DST_SEL_Y = 1; +let DST_SEL_Z = 2; +let DST_SEL_W = 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// 
Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} + +//===---------------------------------------------------------------------===// +// Custom Inserter for Branches and returns, this eventually will be a +// separate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { + def BRANCH : ILFormat<(outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>; +} + +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + +let isTerminator=1 in { + def BREAK : ILFormat< (outs), (ins), + "BREAK", []>; + def CONTINUE : ILFormat< (outs), (ins), + "CONTINUE", []>; + def DEFAULT : ILFormat< (outs), (ins), + "DEFAULT", []>; + def ELSE : ILFormat< (outs), (ins), + "ELSE", []>; + def ENDSWITCH : ILFormat< (outs), (ins), + "ENDSWITCH", []>; + def ENDMAIN : ILFormat< (outs), (ins), + "ENDMAIN", []>; + def END : ILFormat< (outs), (ins), + "END", []>; + def ENDFUNC : ILFormat< (outs), (ins), + "ENDFUNC", []>; + def ENDIF : ILFormat< (outs), (ins), + "ENDIF", []>; + def WHILELOOP : ILFormat< (outs), (ins), + "WHILE", []>; + def ENDLOOP : ILFormat< (outs), (ins), + "ENDLOOP", []>; + def FUNC : 
ILFormat< (outs), (ins), + "FUNC", []>; + def RETDYN : ILFormat< (outs), (ins), + "RET_DYN", []>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; + defm IFC : BranchInstr2<"IFC">; + defm BREAKC : BranchInstr2<"BREAKC">; + defm CONTINUEC : BranchInstr2<"CONTINUEC">; +} + +//===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical <RegisterClass vec_rc> : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical <RegisterClass vec_rc> : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>; +def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>; + +def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>; +def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>; + +class ExtractVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>; + +class InsertVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; + +//===----------------------------------------------------------------------===// +// ISel Patterns +//===----------------------------------------------------------------------===// + +// CND*_INT Patterns for f32 True / False values + +class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < + (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), + (cnd $src0, $src1, $src2) +>; + +def : CND_INT_f32 <CNDE_INT, SETEQ>; +def : CND_INT_f32 <CNDGT_INT, SETGT>; +def : CND_INT_f32 <CNDGE_INT, SETGE>; + +//CNDGE_INT extra pattern +def : Pat < + (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), + (CNDGE_INT $src0, $src1, $src2) +>; + +// KIL Patterns +def KILP : Pat < + (int_AMDGPU_kilp), + (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) +>; + +def KIL : Pat < + (int_AMDGPU_kill f32:$src0), + 
 (MASK_WRITE (KILLGT (f32 ZERO), $src0)) +>; + +def : Extract_Element <f32, v4f32, 0, sub0>; +def : Extract_Element <f32, v4f32, 1, sub1>; +def : Extract_Element <f32, v4f32, 2, sub2>; +def : Extract_Element <f32, v4f32, 3, sub3>; + +def : Insert_Element <f32, v4f32, 0, sub0>; +def : Insert_Element <f32, v4f32, 1, sub1>; +def : Insert_Element <f32, v4f32, 2, sub2>; +def : Insert_Element <f32, v4f32, 3, sub3>; + +def : Extract_Element <i32, v4i32, 0, sub0>; +def : Extract_Element <i32, v4i32, 1, sub1>; +def : Extract_Element <i32, v4i32, 2, sub2>; +def : Extract_Element <i32, v4i32, 3, sub3>; + +def : Insert_Element <i32, v4i32, 0, sub0>; +def : Insert_Element <i32, v4i32, 1, sub1>; +def : Insert_Element <i32, v4i32, 2, sub2>; +def : Insert_Element <i32, v4i32, 3, sub3>; + +def : Extract_Element <f32, v2f32, 0, sub0>; +def : Extract_Element <f32, v2f32, 1, sub1>; + +def : Insert_Element <f32, v2f32, 0, sub0>; +def : Insert_Element <f32, v2f32, 1, sub1>; + +def : Extract_Element <i32, v2i32, 0, sub0>; +def : Extract_Element <i32, v2i32, 1, sub1>; + +def : Insert_Element <i32, v2i32, 0, sub0>; +def : Insert_Element <i32, v2i32, 1, sub1>; + +// bitconvert patterns + +def : BitConvert <i32, f32, R600_Reg32>; +def : BitConvert <f32, i32, R600_Reg32>; +def : BitConvert <v2f32, v2i32, R600_Reg64>; +def : BitConvert <v2i32, v2f32, R600_Reg64>; +def : BitConvert <v4f32, v4i32, R600_Reg128>; +def : BitConvert <v4i32, v4f32, R600_Reg128>; + +// DWORDADDR pattern +def : DwordAddrPat <i32, R600_Reg32>; + +} // End isR600toCayman Predicate + +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>; +defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>; +} // End isR600 + +def getLDSNoRetOp : InstrMapping { + let FilterClass = "R600_LDS_1A1D"; + let RowFields = ["BaseOp"]; + let ColFields = ["DisableEncoding"]; + let KeyCol = ["$dst"]; + let ValueCols = [[""""]]; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td new file mode 100644 index 0000000..9681747 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td @@ -0,0 +1,75 @@ +//===-- R600Intrinsics.td - R600 Intrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// R600 Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + class TextureIntrinsicFloatInput : + Intrinsic<[llvm_v4f32_ty], [ + llvm_v4f32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + class TextureIntrinsicInt32Input : + Intrinsic<[llvm_v4i32_ty], [ + llvm_v4i32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + + def int_R600_load_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_const : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_R600_interp_xy : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; +def int_R600_interp_zw : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_R600_load_texbuf : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_tex : TextureIntrinsicFloatInput; + def int_R600_texc : TextureIntrinsicFloatInput; + def int_R600_txl : TextureIntrinsicFloatInput; + def int_R600_txlc : TextureIntrinsicFloatInput; + def int_R600_txb : TextureIntrinsicFloatInput; + def int_R600_txbc : TextureIntrinsicFloatInput; + def int_R600_txf : TextureIntrinsicInt32Input; + def int_R600_ldptr : TextureIntrinsicInt32Input; + def int_R600_txq : TextureIntrinsicInt32Input; + def int_R600_ddx : TextureIntrinsicFloatInput; + def int_R600_ddy : TextureIntrinsicFloatInput; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_stream_output : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_depth : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_stencil : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp new file mode 100644 index 0000000..01105c6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -0,0 +1,20 @@ +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + + +// Pin the vtable to this file. 
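Going back to the intrinsic declarations in R600Intrinsics.td above: these are target intrinsics, so one straightforward way for a frontend to reference them is by their mangled "llvm.R600.*" names. A hedged sketch of emitting a call to llvm.R600.load.texbuf, whose signature per the declaration above is <4 x float> (i32, i32); the wrapper name is invented for the example and the calls are written against the LLVM C++ API of this era:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Declare (if needed) and call llvm.R600.load.texbuf(offset, buffer_id).
static Value *emitLoadTexbuf(Module &M, IRBuilder<> &B,
                             Value *Offset, Value *BufferId) {
  LLVMContext &Ctx = M.getContext();
  Type *I32 = Type::getInt32Ty(Ctx);
  Type *V4F32 = VectorType::get(Type::getFloatTy(Ctx), 4);
  // Look the declaration up by its mangled name; the module keeps one
  // declaration per name, so repeated calls reuse it.
  Constant *Callee = M.getOrInsertFunction(
      "llvm.R600.load.texbuf", FunctionType::get(V4F32, {I32, I32}, false));
  return B.CreateCall(Callee, {Offset, BufferId});
}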
+void R600MachineFunctionInfo::anchor() {} + +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF) { } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h new file mode 100644 index 0000000..263561e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -0,0 +1,34 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include <vector> + +namespace llvm { + +class R600MachineFunctionInfo : public AMDGPUMachineFunction { + void anchor() override; +public: + R600MachineFunctionInfo(const MachineFunction &MF); + SmallVector<unsigned, 4> LiveOuts; + std::vector<unsigned> IndirectRegs; + unsigned StackSize; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp new file mode 100644 index 0000000..bcde5fb --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -0,0 +1,469 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); + DAG = static_cast<ScheduleDAGMILive*>(dag); + const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>(); + TII = static_cast<const R600InstrInfo*>(DAG->TII); + TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); + VLIW5 = !ST.hasCaymanISA(); + MRI = &DAG->MRI; + CurInstKind = IDOther; + CurEmitted = 0; + OccupedSlotsMask = 31; + InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); + InstKindLimit[IDOther] = 32; + InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); + AluInstCount = 0; + FetchInstCount = 0; +} + +void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, + std::vector<SUnit *> &QDst) +{ + QDst.insert(QDst.end(), QSrc.begin(), QSrc.end()); + QSrc.clear(); +} + +static +unsigned getWFCountLimitedByGPR(unsigned GPRCount) { + assert (GPRCount && "GPRCount cannot be 0"); + return 248 / GPRCount; +} + +SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { + SUnit *SU = nullptr; + NextInstKind = IDOther; + + IsTopNode = false; + + // check if we might want to switch current clause type + bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || + (Available[CurInstKind].empty()); + bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && + (!Available[IDFetch].empty() || !Available[IDOther].empty()); + + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { + // We use the heuristic provided by AMD Accelerated Parallel Processing + // OpenCL Programming Guide : + // The approx. number of WF that allows TEX inst to hide ALU inst is : + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) + float ALUFetchRationEstimate = + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / + (FetchInstCount + Available[IDFetch].size()); + if (ALUFetchRationEstimate == 0) { + AllowSwitchFromAlu = true; + } else { + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. 
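// A rough worked example of the heuristic above (illustrative numbers, not
// from the original source): suppose the numerator counts 40 ALU instructions
// and Available[IDFetch] holds 4 fetches with none issued yet
// (FetchInstCount == 0). Then:
//   ALUFetchRationEstimate    = 40 / 4 = 10   (integer division, then widened
//                                              to float, so any ratio below 1
//                                              collapses to 0 and takes the
//                                              AllowSwitchFromAlu branch above)
//   NeededWF                  = 62.5 / 10 -> 6 wavefronts
//   NearRegisterRequirement   = 2 * 4 = 8 GPRs (computed just below)
//   getWFCountLimitedByGPR(8) = 248 / 8 = 31 wavefronts
// Since 6 is not greater than 31, the switch away from the ALU clause is not
// forced for this group.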
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } + } + + if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || + (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { + // try to pick ALU + SU = pickAlu(); + if (!SU && !PhysicalRegCopy.empty()) { + SU = PhysicalRegCopy.front(); + PhysicalRegCopy.erase(PhysicalRegCopy.begin()); + } + if (SU) { + if (CurEmitted >= InstKindLimit[IDAlu]) + CurEmitted = 0; + NextInstKind = IDAlu; + } + } + + if (!SU) { + // try to pick FETCH + SU = pickOther(IDFetch); + if (SU) + NextInstKind = IDFetch; + } + + // try to pick other + if (!SU) { + SU = pickOther(IDOther); + if (SU) + NextInstKind = IDOther; + } + + DEBUG( + if (SU) { + dbgs() << " ** Pick node **\n"; + SU->dump(DAG); + } else { + dbgs() << "NO NODE \n"; + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + } + ); + + return SU; +} + +void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (NextInstKind != CurInstKind) { + DEBUG(dbgs() << "Instruction Type Switch\n"); + if (NextInstKind != IDAlu) + OccupedSlotsMask |= 31; + CurEmitted = 0; + CurInstKind = NextInstKind; + } + + if (CurInstKind == IDAlu) { + AluInstCount ++; + switch (getAluKind(SU)) { + case AluT_XYZW: + CurEmitted += 4; + break; + case AluDiscarded: + break; + default: { + ++CurEmitted; + for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), + E = SU->getInstr()->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++CurEmitted; + } + } + } + } else { + ++CurEmitted; + } + + + DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + + if (CurInstKind != IDFetch) { + MoveUnits(Pending[IDFetch], Available[IDFetch]); + } else + FetchInstCount++; +} + +static bool +isPhysicalRegCopy(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::COPY) + return false; + + return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); +} + +void R600SchedStrategy::releaseTopNode(SUnit *SU) { + DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); +} + +void R600SchedStrategy::releaseBottomNode(SUnit *SU) { + DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); + if (isPhysicalRegCopy(SU->getInstr())) { + PhysicalRegCopy.push_back(SU); + return; + } + + int IK = getInstKind(SU); + + // There is no export clause, we can schedule one as soon as its ready + if (IK == IDOther) + Available[IDOther].push_back(SU); + else + Pending[IK].push_back(SU); + +} + +bool R600SchedStrategy::regBelongsToClass(unsigned Reg, + const TargetRegisterClass *RC) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->contains(Reg); + } else { + return MRI->getRegClass(Reg) == RC; + } +} + +R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { + MachineInstr *MI = SU->getInstr(); + + if (TII->isTransOnly(MI)) + return AluTrans; + + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + return AluPredX; + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return AluT_XYZW; + case AMDGPU::COPY: + if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; + } + default: + break; + } + + // Does the instruction take a whole IG ? 
+ // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + return AluT_XYZW; + } + + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } + + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + + // Is the result already member of a X/Y/Z/W class ? + unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + // LDS src registers cannot be used in the Trans slot. + if (TII->readsLDSSrcReg(MI)) + return AluT_XYZW; + + return AluAny; + +} + +int R600SchedStrategy::getInstKind(SUnit* SU) { + int Opcode = SU->getInstr()->getOpcode(); + + if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) + return IDFetch; + + if (TII->isALUInstr(Opcode)) { + return IDAlu; + } + + switch (Opcode) { + case AMDGPU::PRED_X: + case AMDGPU::COPY: + case AMDGPU::CONST_COPY: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return IDAlu; + default: + return IDOther; + } +} + +SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { + if (Q.empty()) + return nullptr; + for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend(); + It != E; ++It) { + SUnit *SU = *It; + InstructionsGroupCandidate.push_back(SU->getInstr()); + if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) + && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) + ) { + InstructionsGroupCandidate.pop_back(); + Q.erase((It + 1).base()); + return SU; + } else { + InstructionsGroupCandidate.pop_back(); + } + } + return nullptr; +} + +void R600SchedStrategy::LoadAlu() { + std::vector<SUnit *> &QSrc = Pending[IDAlu]; + for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { + AluKind AK = getAluKind(QSrc[i]); + AvailableAlus[AK].push_back(QSrc[i]); + } + QSrc.clear(); +} + +void R600SchedStrategy::PrepareNextSlot() { + DEBUG(dbgs() << "New Slot\n"); + assert (OccupedSlotsMask && "Slot wasn't filled"); + OccupedSlotsMask = 0; +// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) +// OccupedSlotsMask |= 16; + InstructionsGroupCandidate.clear(); + LoadAlu(); +} + +void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { + int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (DstIndex == -1) { + return; + } + unsigned DestReg = MI->getOperand(DstIndex).getReg(); + // PressureRegister crashes if an operand is def and used in the same inst + // and we try to constraint its regclass + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && !MO.isDef() && + MO.getReg() == 
DestReg) + return; + } + // Constrains the regclass of DestReg to assign it to Slot + switch (Slot) { + case 0: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + break; + case 1: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + break; + case 2: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + break; + case 3: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + break; + } +} + +SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { + static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; + SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu); + if (SlotedSU) + return SlotedSU; + SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu); + if (UnslotedSU) + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; +} + +unsigned R600SchedStrategy::AvailablesAluCount() const { + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + + AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + + AvailableAlus[AluPredX].size(); +} + +SUnit* R600SchedStrategy::pickAlu() { + while (AvailablesAluCount() || !Pending[IDAlu].empty()) { + if (!OccupedSlotsMask) { + // Bottom up scheduling : predX must comes first + if (!AvailableAlus[AluPredX].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluPredX], false); + } + // Flush physical reg copies (RA will discard them) + if (!AvailableAlus[AluDiscarded].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluDiscarded], false); + } + // If there is a T_XYZW alu available, use it + if (!AvailableAlus[AluT_XYZW].empty()) { + OccupedSlotsMask |= 15; + return PopInst(AvailableAlus[AluT_XYZW], false); + } + } + bool TransSlotOccuped = OccupedSlotsMask & 16; + if (!TransSlotOccuped && VLIW5) { + if (!AvailableAlus[AluTrans].empty()) { + OccupedSlotsMask |= 16; + return PopInst(AvailableAlus[AluTrans], false); + } + SUnit *SU = AttemptFillSlot(3, true); + if (SU) { + OccupedSlotsMask |= 16; + return SU; + } + } + for (int Chan = 3; Chan > -1; --Chan) { + bool isOccupied = OccupedSlotsMask & (1 << Chan); + if (!isOccupied) { + SUnit *SU = AttemptFillSlot(Chan, false); + if (SU) { + OccupedSlotsMask |= (1 << Chan); + InstructionsGroupCandidate.push_back(SU->getInstr()); + return SU; + } + } + } + PrepareNextSlot(); + } + return nullptr; +} + +SUnit* R600SchedStrategy::pickOther(int QID) { + SUnit *SU = nullptr; + std::vector<SUnit *> &AQ = Available[QID]; + + if (AQ.empty()) { + MoveUnits(Pending[QID], AQ); + } + if (!AQ.empty()) { + SU = AQ.back(); + AQ.resize(AQ.size() - 1); + } + return SU; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h new file mode 100644 index 0000000..fc5b95c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h @@ -0,0 +1,103 @@ +//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H + +#include "R600InstrInfo.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace llvm { + +class R600SchedStrategy : public MachineSchedStrategy { + + const ScheduleDAGMILive *DAG; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + MachineRegisterInfo *MRI; + + enum InstKind { + IDAlu, + IDFetch, + IDOther, + IDLast + }; + + enum AluKind { + AluAny, + AluT_X, + AluT_Y, + AluT_Z, + AluT_W, + AluT_XYZW, + AluPredX, + AluTrans, + AluDiscarded, // LLVM Instructions that are going to be eliminated + AluLast + }; + + std::vector<SUnit *> Available[IDLast], Pending[IDLast]; + std::vector<SUnit *> AvailableAlus[AluLast]; + std::vector<SUnit *> PhysicalRegCopy; + + InstKind CurInstKind; + int CurEmitted; + InstKind NextInstKind; + + unsigned AluInstCount; + unsigned FetchInstCount; + + int InstKindLimit[IDLast]; + + int OccupedSlotsMask; + +public: + R600SchedStrategy() : + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { + } + + virtual ~R600SchedStrategy() {} + + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void releaseBottomNode(SUnit *SU) override; + +private: + std::vector<MachineInstr *> InstructionsGroupCandidate; + bool VLIW5; + + int getInstKind(SUnit *SU); + bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; + AluKind getAluKind(SUnit *SU) const; + void LoadAlu(); + unsigned AvailablesAluCount() const; + SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); + void PrepareNextSlot(); + SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU); + + void AssignSlot(MachineInstr *MI, unsigned Slot); + SUnit* pickAlu(); + SUnit* pickOther(int QID); + void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst); +}; + +} // namespace llvm + +#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp new file mode 100644 index 0000000..5efb3b9 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -0,0 +1,382 @@ +//===--------------------- R600MergeVectorRegisters.cpp -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass merges inputs of swizzeable instructions into vector sharing +/// common data and/or have enough undef subreg using swizzle abilities. +/// +/// For instance let's consider the following pseudo code : +/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... +/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 +/// +/// is turned into : +/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... 
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 +/// +/// This allow regalloc to reduce register pressure for vector registers and +/// to reduce MOV count. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "vec-merger" + +namespace { + +static bool +isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { + for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), + E = MRI.def_instr_end(); It != E; ++It) { + return (*It).isImplicitDef(); + } + if (MRI.isReserved(Reg)) { + return false; + } + llvm_unreachable("Reg without a def"); + return false; +} + +class RegSeqInfo { +public: + MachineInstr *Instr; + DenseMap<unsigned, unsigned> RegToChan; + std::vector<unsigned> UndefReg; + RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { + assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { + MachineOperand &MO = Instr->getOperand(i); + unsigned Chan = Instr->getOperand(i + 1).getImm(); + if (isImplicitlyDef(MRI, MO.getReg())) + UndefReg.push_back(Chan); + else + RegToChan[MO.getReg()] = Chan; + } + } + RegSeqInfo() {} + + bool operator==(const RegSeqInfo &RSI) const { + return RSI.Instr == Instr; + } +}; + +class R600VectorRegMerger : public MachineFunctionPass { +private: + MachineRegisterInfo *MRI; + const R600InstrInfo *TII; + bool canSwizzle(const MachineInstr &) const; + bool areAllUsesSwizzeable(unsigned Reg) const; + void SwizzleInput(MachineInstr &, + const std::vector<std::pair<unsigned, unsigned> > &) const; + bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, + std::vector<std::pair<unsigned, unsigned> > &Remap) const; + bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan); + bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan); + MachineInstr *RebuildVector(RegSeqInfo *MI, + const RegSeqInfo *BaseVec, + const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const; + void RemoveMI(MachineInstr *); + void trackRSI(const RegSeqInfo &RSI); + + typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap; + DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq; + InstructionSetMap PreviousRegSeqByReg; + InstructionSetMap PreviousRegSeqByUndefCount; +public: + static char ID; + R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Vector Registers Merge Pass"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; + +char 
R600VectorRegMerger::ID = 0; + +bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) + const { + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) + return true; + switch (MI.getOpcode()) { + case AMDGPU::R600_ExportSwz: + case AMDGPU::EG_ExportSwz: + return true; + default: + return false; + } +} + +bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, + RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap) + const { + unsigned CurrentUndexIdx = 0; + for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(), + E = ToMerge->RegToChan.end(); It != E; ++It) { + DenseMap<unsigned, unsigned>::const_iterator PosInUntouched = + Untouched->RegToChan.find((*It).first); + if (PosInUntouched != Untouched->RegToChan.end()) { + Remap.push_back(std::pair<unsigned, unsigned> + ((*It).second, (*PosInUntouched).second)); + continue; + } + if (CurrentUndexIdx >= Untouched->UndefReg.size()) + return false; + Remap.push_back(std::pair<unsigned, unsigned> + ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + } + + return true; +} + +static +unsigned getReassignedChan( + const std::vector<std::pair<unsigned, unsigned> > &RemapChan, + unsigned Chan) { + for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { + if (RemapChan[j].first == Chan) + return RemapChan[j].second; + } + llvm_unreachable("Chan wasn't reassigned"); +} + +MachineInstr *R600VectorRegMerger::RebuildVector( + RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, + const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const { + unsigned Reg = RSI->Instr->getOperand(0).getReg(); + MachineBasicBlock::iterator Pos = RSI->Instr; + MachineBasicBlock &MBB = *Pos->getParent(); + DebugLoc DL = Pos->getDebugLoc(); + + unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; + std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; + for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(), + E = RSI->RegToChan.end(); It != E; ++It) { + unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned SubReg = (*It).first; + unsigned Swizzle = (*It).second; + unsigned Chan = getReassignedChan(RemapChan, Swizzle); + + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + DstReg) + .addReg(SrcVec) + .addReg(SubReg) + .addImm(Chan); + UpdatedRegToChan[SubReg] = Chan; + std::vector<unsigned>::iterator ChanPos = + std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); + if (ChanPos != UpdatedUndef.end()) + UpdatedUndef.erase(ChanPos); + assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == + UpdatedUndef.end() && + "UpdatedUndef shouldn't contain Chan more than once!"); + DEBUG(dbgs() << " ->"; Tmp->dump();); + (void)Tmp; + SrcVec = DstReg; + } + Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) + .addReg(SrcVec); + DEBUG(dbgs() << " ->"; Pos->dump();); + + DEBUG(dbgs() << " Updating Swizzle:\n"); + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); + SwizzleInput(*It, RemapChan); + DEBUG((*It).dump()); + } + RSI->Instr->eraseFromParent(); + + // Update RSI + RSI->Instr = Pos; + RSI->RegToChan = UpdatedRegToChan; + RSI->UndefReg = UpdatedUndef; + + return Pos; +} + +void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { + for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), + E = 
PreviousRegSeqByReg.end(); It != E; ++It) { + std::vector<MachineInstr *> &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } + for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), + E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { + std::vector<MachineInstr *> &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } +} + +void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, + const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const { + unsigned Offset; + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) + Offset = 2; + else + Offset = 3; + for (unsigned i = 0; i < 4; i++) { + unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; + for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { + if (RemapChan[j].first == Swizzle) { + MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + break; + } + } + } +} + +bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + if (!canSwizzle(*It)) + return false; + } + return true; +} + +bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan) { + for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), + MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { + if (!MOp->isReg()) + continue; + if (PreviousRegSeqByReg[MOp->getReg()].empty()) + continue; + for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { + CompatibleRSI = PreviousRegSeq[MI]; + if (RSI == CompatibleRSI) + continue; + if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) + return true; + } + } + return false; +} + +bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan) { + unsigned NeededUndefs = 4 - RSI.UndefReg.size(); + if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) + return false; + std::vector<MachineInstr *> &MIs = + PreviousRegSeqByUndefCount[NeededUndefs]; + CompatibleRSI = PreviousRegSeq[MIs.back()]; + tryMergeVector(&CompatibleRSI, &RSI, RemapChan); + return true; +} + +void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { + for (DenseMap<unsigned, unsigned>::const_iterator + It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { + PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); + } + PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); + PreviousRegSeq[RSI.Instr] = RSI; +} + +bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo()); + MRI = &(Fn.getRegInfo()); + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock *MB = &*MBB; + PreviousRegSeq.clear(); + PreviousRegSeqByReg.clear(); + PreviousRegSeqByUndefCount.clear(); + + for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); + MII != MIIE; ++MII) { + MachineInstr *MI = MII; + if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { + if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { + unsigned Reg = MI->getOperand(1).getReg(); + for (MachineRegisterInfo::def_instr_iterator + It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); + It != E; ++It) { + RemoveMI(&(*It)); + } + } + continue; + } + + + RegSeqInfo RSI(*MRI, MI); + + // 
All uses of MI are swizzeable ? + unsigned Reg = MI->getOperand(0).getReg(); + if (!areAllUsesSwizzeable(Reg)) + continue; + + DEBUG (dbgs() << "Trying to optimize "; + MI->dump(); + ); + + RegSeqInfo CandidateRSI; + std::vector<std::pair<unsigned, unsigned> > RemapChan; + DEBUG(dbgs() << "Using common slots...\n";); + if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { + // Remove CandidateRSI mapping + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + DEBUG(dbgs() << "Using free slots...\n";); + RemapChan.clear(); + if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + //Failed to merge + trackRSI(RSI); + } + } + return false; +} + +} + +llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { + return new R600VectorRegMerger(tm); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp new file mode 100644 index 0000000..2126961 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -0,0 +1,408 @@ +//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass implements instructions packetization for R600. It unsets isLast +/// bit of instructions inside a bundle and substitutes src register with +/// PreviousVector when applicable. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "packets" + +namespace { + +class R600Packetizer : public MachineFunctionPass { + +public: + static char ID; + R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Packetizer"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; +char R600Packetizer::ID = 0; + +class R600PacketizerList : public VLIWPacketizerList { + +private: + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + bool VLIW5; + bool ConsideredInstUsesAlreadyWrittenVectorElement; + + unsigned getSlot(const MachineInstr *MI) const { + return TRI.getHWRegChan(MI->getOperand(0).getReg()); + } + + /// \returns register to PV chan mapping for bundle/single instructions that + /// immediately precedes I. 
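  /// A hypothetical illustration (registers invented for the example): if the
  /// group immediately before I wrote T1_X and T1_Z in vector slots and left a
  /// trans-slot result in T2_Y, the returned map would be roughly
  ///   { T1_X -> PV_X, T1_Z -> PV_Z, T2_Y -> PS }
  /// and substitutePV() below can then rewrite sources that read those
  /// registers to the PV/PS bypass operands of the current group.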
+ DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I) + const { + DenseMap<unsigned, unsigned> Result; + I--; + if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) + return Result; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + if (I->isBundle()) + BI++; + int LastDstChan = -1; + do { + bool isTrans = false; + int BISlot = getSlot(&*BI); + if (LastDstChan >= BISlot) + isTrans = true; + LastDstChan = BISlot; + if (TII->isPredicated(&*BI)) + continue; + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) + continue; + int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + if (DstIdx == -1) { + continue; + } + unsigned Dst = BI->getOperand(DstIdx).getReg(); + if (isTrans || TII->isTransOnly(&*BI)) { + Result[Dst] = AMDGPU::PS; + continue; + } + if (BI->getOpcode() == AMDGPU::DOT4_r600 || + BI->getOpcode() == AMDGPU::DOT4_eg) { + Result[Dst] = AMDGPU::PV_X; + continue; + } + if (Dst == AMDGPU::OQAP) { + continue; + } + unsigned PVReg = 0; + switch (TRI.getHWRegChan(Dst)) { + case 0: + PVReg = AMDGPU::PV_X; + break; + case 1: + PVReg = AMDGPU::PV_Y; + break; + case 2: + PVReg = AMDGPU::PV_Z; + break; + case 3: + PVReg = AMDGPU::PV_W; + break; + default: + llvm_unreachable("Invalid Chan"); + } + Result[Dst] = PVReg; + } while ((++BI)->isBundledWithPred()); + return Result; + } + + void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs) + const { + unsigned Ops[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0) + continue; + unsigned Src = MI->getOperand(OperandIdx).getReg(); + const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); + if (It != PVs.end()) + MI->getOperand(OperandIdx).setReg(It->second); + } + } +public: + // Ctor. + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + : VLIWPacketizerList(MF, MLI, nullptr), + TII(static_cast<const R600InstrInfo *>( + MF.getSubtarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { + VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + } + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() override { + ConsideredInstUsesAlreadyWrittenVectorElement = false; + } + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override { + return false; + } + + // isSoloInstruction - return true if instruction MI can not be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(const MachineInstr *MI) override { + if (TII->isVector(*MI)) + return true; + if (!TII->isALUInstr(MI->getOpcode())) + return true; + if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + return true; + // XXX: This can be removed once the packetizer properly handles all the + // LDS instruction group restrictions. + if (TII->isLDSInstr(MI->getOpcode())) + return true; + return false; + } + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. 
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { + MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); + if (getSlot(MII) == getSlot(MIJ)) + ConsideredInstUsesAlreadyWrittenVectorElement = true; + // Does MII and MIJ share the same pred_sel ? + int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + if (PredI != PredJ) + return false; + if (SUJ->isSucc(SUI)) { + for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { + const SDep &Dep = SUJ->Succs[i]; + if (Dep.getSUnit() != SUI) + continue; + if (Dep.getKind() == SDep::Anti) + continue; + if (Dep.getKind() == SDep::Output) + if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) + continue; + return false; + } + } + + bool ARDef = TII->definesAddressRegister(MII) || + TII->definesAddressRegister(MIJ); + bool ARUse = TII->usesAddressRegister(MII) || + TII->usesAddressRegister(MIJ); + if (ARDef && ARUse) + return false; + + return true; + } + + // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // and SUJ. + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } + + void setIsLastBit(MachineInstr *MI, unsigned Bit) const { + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + MI->getOperand(LastOp).setImm(Bit); + } + + bool isBundlableWithCurrentPMI(MachineInstr *MI, + const DenseMap<unsigned, unsigned> &PV, + std::vector<R600InstrInfo::BankSwizzle> &BS, + bool &isTransSlot) { + isTransSlot = TII->isTransOnly(MI); + assert (!isTransSlot || VLIW5); + + // Is the dst reg sequence legal ? + if (!isTransSlot && !CurrentPacketMIs.empty()) { + if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { + if (ConsideredInstUsesAlreadyWrittenVectorElement && + !TII->isVectorOnly(MI) && VLIW5) { + isTransSlot = true; + DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); + } + else + return false; + } + } + + // Are the Constants limitations met ? + CurrentPacketMIs.push_back(MI); + if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Consts read limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // Is there a BankSwizzle set that meet Read Port limitations ? + if (!TII->fitsReadPortLimitations(CurrentPacketMIs, + PV, BS, isTransSlot)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Read port limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // We cannot read LDS source registrs from the Trans slot. + if (isTransSlot && TII->readsLDSSrcReg(MI)) + return false; + + CurrentPacketMIs.pop_back(); + return true; + } + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { + MachineBasicBlock::iterator FirstInBundle = + CurrentPacketMIs.empty() ? 
MI : CurrentPacketMIs.front(); + const DenseMap<unsigned, unsigned> &PV = + getPreviousVector(FirstInBundle); + std::vector<R600InstrInfo::BankSwizzle> BS; + bool isTransSlot; + + if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { + for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { + MachineInstr *MI = CurrentPacketMIs[i]; + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS[i]); + } + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS.back()); + if (!CurrentPacketMIs.empty()) + setIsLastBit(CurrentPacketMIs.back(), 0); + substitutePV(MI, PV); + MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); + if (isTransSlot) { + endPacket(std::next(It)->getParent(), std::next(It)); + } + return It; + } + endPacket(MI->getParent(), MI); + if (TII->isTransOnly(MI)) + return MI; + return VLIWPacketizerList::addToPacket(MI); + } +}; + +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + + // Instantiate the packetizer. + R600PacketizerList Packetizer(Fn, MLI); + + // DFA state table should not be empty. + assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + + // + // Loop over all basic blocks and remove KILL pseudo-instructions + // These instructions confuse the dependence analysis. Consider: + // D0 = ... (Insn 0) + // R0 = KILL R0, D0 (Insn 1) + // R0 = ... (Insn 2) + // Here, Insn 1 will result in the dependence graph not emitting an output + // dependence between Insn 0 and Insn 2. This can lead to incorrect + // packetization + // + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock::iterator End = MBB->end(); + MachineBasicBlock::iterator MI = MBB->begin(); + while (MI != End) { + if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || + (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + MachineBasicBlock::iterator DeleteMI = MI; + ++MI; + MBB->erase(DeleteMI); + End = MBB->end(); + continue; + } + ++MI; + } + } + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + // Find scheduling regions and schedule / packetize each region. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin();) { + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) + break; + } + I = MBB->begin(); + + // Skip empty scheduling regions. + if (I == RegionEnd) { + RegionEnd = std::prev(RegionEnd); + --RemainingCount; + continue; + } + // Skip regions with one instruction. 
+ if (I == std::prev(RegionEnd)) { + RegionEnd = std::prev(RegionEnd); + continue; + } + + Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); + RegionEnd = I; + } + } + + return true; + +} + +} // end anonymous namespace + +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { + return new R600Packetizer(tm); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp new file mode 100644 index 0000000..fb0359c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -0,0 +1,91 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "R600RegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { + RCW.RegWeight = 0; + RCW.WeightLimit = 0; +} + +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + Reserved.set(AMDGPU::ZERO); + Reserved.set(AMDGPU::HALF); + Reserved.set(AMDGPU::ONE); + Reserved.set(AMDGPU::ONE_INT); + Reserved.set(AMDGPU::NEG_HALF); + Reserved.set(AMDGPU::NEG_ONE); + Reserved.set(AMDGPU::PV_X); + Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); + Reserved.set(AMDGPU::PREDICATE_BIT); + Reserved.set(AMDGPU::PRED_SEL_OFF); + Reserved.set(AMDGPU::PRED_SEL_ZERO); + Reserved.set(AMDGPU::PRED_SEL_ONE); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + TII->reserveIndirectRegisters(Reserved, MF); + + return Reserved; +} + +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { + return GET_REG_INDEX(getEncodingValue(Reg)); +} + +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + } +} + +const RegClassWeight &R600RegisterInfo::getRegClassWeight( + const TargetRegisterClass *RC) const { + return RCW; +} + +bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + switch (Reg) { + case AMDGPU::OQAP: + case AMDGPU::OQBP: + case AMDGPU::AR_X: + return false; + default: + return true; + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h new file mode 100644 index 0000000..4f8a129 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -0,0 +1,49 @@ +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of 
Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600RegisterInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H + +#include "AMDGPURegisterInfo.h" + +namespace llvm { + +class AMDGPUSubtarget; + +struct R600RegisterInfo : public AMDGPURegisterInfo { + RegClassWeight RCW; + + R600RegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + /// \brief get the HW encoding for a register's channel. + unsigned getHWRegChan(unsigned reg) const; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; + + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; + + // \returns true if \p Reg can be defined in one ALU caluse and used in another. + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td new file mode 100644 index 0000000..cc667d9 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td @@ -0,0 +1,252 @@ + +class R600Reg <string name, bits<16> encoding> : Register<name> { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +class R600RegWithChan <string name, bits<9> sel, string chan> : + Register <name> { + + field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, + !if(!eq(chan, "Y"), 1, + !if(!eq(chan, "Z"), 2, + !if(!eq(chan, "W"), 3, 0)))); + let HWEncoding{8-0} = sel; + let HWEncoding{10-9} = chan_encoding; + let Namespace = "AMDGPU"; +} + +class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : + RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1, sub2, sub3]; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> : + RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)], + lo +>; + +foreach Index = 0-127 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + } + // 128-bit Temporary Registers + def T#Index#_XYZW : R600Reg_128 <"T"#Index#"", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y"), + !cast<Register>("T"#Index#"_Z"), + !cast<Register>("T"#Index#"_W")], + Index>; + + def T#Index#_XY : R600Reg_64 <"T"#Index#"", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y")], + Index>; +} + +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + 
!if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast<Register>("T0_"#Chan), + !cast<Register>("T1_"#Chan), + !cast<Register>("T2_"#Chan), + !cast<Register>("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } +} + + +// KCACHE_BANK0 +foreach Index = 159-128 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW", + [!cast<Register>("KC0_"#Index#"_X"), + !cast<Register>("KC0_"#Index#"_Y"), + !cast<Register>("KC0_"#Index#"_Z"), + !cast<Register>("KC0_"#Index#"_W")], + Index>; +} + +// KCACHE_BANK1 +foreach Index = 191-160 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW", + [!cast<Register>("KC1_"#Index#"_X"), + !cast<Register>("KC1_"#Index#"_Y"), + !cast<Register>("KC1_"#Index#"_Z"), + !cast<Register>("KC1_"#Index#"_W")], + Index>; +} + + +// Array Base Register holding input in FS +foreach Index = 448-480 in { + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; +} + + +// Special Registers + +def OQA : R600Reg<"OQA", 219>; +def OQB : R600Reg<"OQB", 220>; +def OQAP : R600Reg<"OQAP", 221>; +def OQBP : R600Reg<"OQAP", 222>; +def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>; +def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>; +def ZERO : R600Reg<"0.0", 248>; +def ONE : R600Reg<"1.0", 249>; +def NEG_ONE : R600Reg<"-1.0", 249>; +def ONE_INT : R600Reg<"1", 250>; +def HALF : R600Reg<"0.5", 252>; +def NEG_HALF : R600Reg<"-0.5", 252>; +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; +def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; +def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; +def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; +def PV_X : R600RegWithChan<"PV.X", 254, "X">; +def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">; +def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">; +def PV_W : R600RegWithChan<"PV.W", 254, "W">; +def PS: R600Reg<"PS", 255>; +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; + +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "ArrayBase%u", 448, 480))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; + +let isAllocatable = 0 in { + +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME if we add more vertical vector registers we will need to ad more +// registers to these classes. 
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; + +def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, + (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; + +def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_X", 128, 159))>; + +def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Y", 128, 159))>; + +def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Z", 128, 159))>; + +def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_W", 128, 159))>; + +def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC0_X, R600_KC0_Y, + R600_KC0_Z, R600_KC0_W)>; + +def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_X", 160, 191))>; + +def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Y", 160, 191))>; + +def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Z", 160, 191))>; + +def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_W", 160, 191))>; + +def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC1_X, R600_KC1_Y, + R600_KC1_Z, R600_KC1_W)>; + +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Y", 0, 127))>; + +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Z", 0, 127))>; + +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_W", 0, 127))>; + +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; + +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add + R600_TReg32, + R600_ArrayBase, + R600_Addr, + R600_KC0, R600_KC1, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM, OQAP + )>; + +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; + +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add + PREDICATE_BIT)>; + +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add (sequence "T%u_XYZW", 0, 127))> { + let CopyCost = -1; +} + +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td new file mode 100644 index 0000000..df62bf8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td @@ -0,0 +1,49 @@ +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS +// slot has been removed. +// +//===----------------------------------------------------------------------===// + + +def ALU_X : FuncUnit; +def ALU_Y : FuncUnit; +def ALU_Z : FuncUnit; +def ALU_W : FuncUnit; +def TRANS : FuncUnit; + +def AnyALU : InstrItinClass; +def VecALU : InstrItinClass; +def TransALU : InstrItinClass; +def XALU : InstrItinClass; + +def R600_VLIW5_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>, + InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; + +def R600_VLIW4_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp new file mode 100644 index 0000000..2fc7b02 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp @@ -0,0 +1,303 @@ +//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass translates tgsi-like texture intrinsics into R600 texture +/// closer to hardware intrinsics. 
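A minimal before/after sketch of the rewrite, assuming a plain (non-shadow) 2-D sample with resource and sampler ids of 0 as placeholder values; the operand order follows the TexSign/TexQSign signatures built in doInitialization below. For this case the swizzle is the identity and all four coordinate-type flags stay at 1:

  ; before
  %v = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %coord, i32 0, i32 0, i32 2)

  ; after
  %c = shufflevector <4 x float> %coord, <4 x float> %coord,
                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v = call <4 x float> @llvm.R600.tex(<4 x float> %c, i32 0, i32 0, i32 0,
                                       i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)

Shadow texture types take the same operand list but are routed to the *c variants (e.g. llvm.R600.texc), as decided by getAdjustmentFromTextureTarget.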
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { +class R600TextureIntrinsicsReplacer : + public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> { + static char ID; + + Module *Mod; + Type *FloatType; + Type *Int32Type; + Type *V4f32Type; + Type *V4i32Type; + FunctionType *TexSign; + FunctionType *TexQSign; + + void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, + unsigned SrcSelect[4], unsigned CT[4], + bool &useShadowVariant) { + enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY, + TEXTURE_SHADOWCUBE, + TEXTURE_2D_MSAA, + TEXTURE_2D_ARRAY_MSAA, + TEXTURE_CUBE_ARRAY, + TEXTURE_SHADOWCUBE_ARRAY + }; + + switch (TextureType) { + case 0: + useShadowVariant = false; + return; + case TEXTURE_RECT: + case TEXTURE_1D: + case TEXTURE_2D: + case TEXTURE_3D: + case TEXTURE_CUBE: + case TEXTURE_1D_ARRAY: + case TEXTURE_2D_ARRAY: + case TEXTURE_CUBE_ARRAY: + case TEXTURE_2D_MSAA: + case TEXTURE_2D_ARRAY_MSAA: + useShadowVariant = false; + break; + case TEXTURE_SHADOW1D: + case TEXTURE_SHADOW2D: + case TEXTURE_SHADOWRECT: + case TEXTURE_SHADOW1D_ARRAY: + case TEXTURE_SHADOW2D_ARRAY: + case TEXTURE_SHADOWCUBE: + case TEXTURE_SHADOWCUBE_ARRAY: + useShadowVariant = true; + break; + default: + llvm_unreachable("Unknow Texture Type"); + } + + if (TextureType == TEXTURE_RECT || + TextureType == TEXTURE_SHADOWRECT) { + CT[0] = 0; + CT[1] = 0; + } + + if (TextureType == TEXTURE_CUBE_ARRAY || + TextureType == TEXTURE_SHADOWCUBE_ARRAY) + CT[2] = 0; + + if (TextureType == TEXTURE_1D_ARRAY || + TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (hasLOD && useShadowVariant) { + CT[1] = 0; + } else { + CT[2] = 0; + SrcSelect[2] = 1; + } + } else if (TextureType == TEXTURE_2D_ARRAY || + TextureType == TEXTURE_SHADOW2D_ARRAY) { + CT[2] = 0; + } + + if ((TextureType == TEXTURE_SHADOW1D || + TextureType == TEXTURE_SHADOW2D || + TextureType == TEXTURE_SHADOWRECT || + TextureType == TEXTURE_SHADOW1D_ARRAY) && + !(hasLOD && useShadowVariant)) + SrcSelect[3] = 2; + } + + void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, + unsigned SrcSelect[4], Value *Offset[3], Value *Resource, + Value *Sampler, unsigned CT[4], Value *Coord) { + IRBuilder<> Builder(&I); + Constant *Mask[] = { + ConstantInt::get(Int32Type, SrcSelect[0]), + ConstantInt::get(Int32Type, SrcSelect[1]), + ConstantInt::get(Int32Type, SrcSelect[2]), + ConstantInt::get(Int32Type, SrcSelect[3]) + }; + Value *SwizzleMask = ConstantVector::get(Mask); + Value *SwizzledCoord = + Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); + + Value *Args[] = { + SwizzledCoord, + Offset[0], + Offset[1], + Offset[2], + Resource, + Sampler, + ConstantInt::get(Int32Type, CT[0]), + ConstantInt::get(Int32Type, CT[1]), + ConstantInt::get(Int32Type, CT[2]), + ConstantInt::get(Int32Type, CT[3]) + }; + + Function *F = Mod->getFunction(Name); + if (!F) { + F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); + F->addFnAttr(Attribute::ReadNone); + } + I.replaceAllUsesWith(Builder.CreateCall(F, Args)); + I.eraseFromParent(); + } + + void 
ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, + const char *VanillaInt, + const char *ShadowInt) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(1); + Value *SamplerId = I.getArgOperand(2); + + unsigned TextureType = + cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0) + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + + void ReplaceTXF(CallInst &I) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(4); + Value *SamplerId = I.getArgOperand(5); + + unsigned TextureType = + cast<ConstantInt>(I.getArgOperand(6))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + I.getArgOperand(1), + I.getArgOperand(2), + I.getArgOperand(3), + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + +public: + R600TextureIntrinsicsReplacer(): + FunctionPass(ID) { + } + + bool doInitialization(Module &M) override { + LLVMContext &Ctx = M.getContext(); + Mod = &M; + FloatType = Type::getFloatTy(Ctx); + Int32Type = Type::getInt32Ty(Ctx); + V4f32Type = VectorType::get(FloatType, 4); + V4i32Type = VectorType::get(Int32Type, 4); + Type *ArgsType[] = { + V4f32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); + Type *ArgsQType[] = { + V4i32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); + return false; + } + + bool runOnFunction(Function &F) override { + visit(F); + return false; + } + + const char *getPassName() const override { + return "R600 Texture Intrinsics Replacer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + } + + void visitCallInst(CallInst &I) { + if (!I.getCalledFunction()) + return; + + StringRef Name = I.getCalledFunction()->getName(); + if (Name == "llvm.AMDGPU.tex") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); + return; + } + if (Name == "llvm.AMDGPU.txl") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); + return; + } + if (Name == "llvm.AMDGPU.txb") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); + return; + } + if (Name == "llvm.AMDGPU.txf") { + ReplaceTXF(I); + return; + } + if (Name == "llvm.AMDGPU.txq") { + ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); + return; + } + if (Name == "llvm.AMDGPU.ddx") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); + return; + } + if (Name == "llvm.AMDGPU.ddy") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); + return; + } + } + +}; + +char R600TextureIntrinsicsReplacer::ID = 0; + +} + +FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { + return new 
R600TextureIntrinsicsReplacer(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td new file mode 100644 index 0000000..613a0d7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td @@ -0,0 +1,21 @@ +//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to R700 and newer VLIW4/VLIW5 GPUs +// - Available only on R700 family GPUs. +// +//===----------------------------------------------------------------------===// + +def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; + +let Predicates = [isR700] in { + def SIN_r700 : SIN_Common<0x6E>; + def COS_r700 : COS_Common<0x6F>; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp new file mode 100644 index 0000000..fa4d24a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -0,0 +1,364 @@ +//===-- SIAnnotateControlFlow.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Annotates the control flow with hardware specific intrinsics. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-annotate-control-flow" + +namespace { + +// Complex types used in this pass +typedef std::pair<BasicBlock *, Value *> StackEntry; +typedef SmallVector<StackEntry, 16> StackVector; + +// Intrinsic names the control flow is annotated with +static const char *const IfIntrinsic = "llvm.SI.if"; +static const char *const ElseIntrinsic = "llvm.SI.else"; +static const char *const BreakIntrinsic = "llvm.SI.break"; +static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; +static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; +static const char *const LoopIntrinsic = "llvm.SI.loop"; +static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; + +class SIAnnotateControlFlow : public FunctionPass { + + static char ID; + + Type *Boolean; + Type *Void; + Type *Int64; + Type *ReturnStruct; + + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + Constant *Int64Zero; + + Constant *If; + Constant *Else; + Constant *Break; + Constant *IfBreak; + Constant *ElseBreak; + Constant *Loop; + Constant *EndCf; + + DominatorTree *DT; + StackVector Stack; + + LoopInfo *LI; + + bool isTopOfStack(BasicBlock *BB); + + Value *popSaved(); + + void push(BasicBlock *BB, Value *Saved); + + bool isElse(PHINode *Phi); + + void eraseIfUnused(PHINode *Phi); + + void openIf(BranchInst *Term); + + void insertElse(BranchInst *Term); + + Value 
*handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); + + void handleLoop(BranchInst *Term); + + void closeControlFlow(BasicBlock *BB); + +public: + SIAnnotateControlFlow(): + FunctionPass(ID) { } + + bool doInitialization(Module &M) override; + + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "SI annotate control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char SIAnnotateControlFlow::ID = 0; + +/// \brief Initialize all the types and constants used in the pass +bool SIAnnotateControlFlow::doInitialization(Module &M) { + LLVMContext &Context = M.getContext(); + + Void = Type::getVoidTy(Context); + Boolean = Type::getInt1Ty(Context); + Int64 = Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + Int64Zero = ConstantInt::get(Int64, 0); + + If = M.getOrInsertFunction( + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); + + Else = M.getOrInsertFunction( + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); + + Break = M.getOrInsertFunction( + BreakIntrinsic, Int64, Int64, (Type *)nullptr); + + IfBreak = M.getOrInsertFunction( + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); + + ElseBreak = M.getOrInsertFunction( + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); + + Loop = M.getOrInsertFunction( + LoopIntrinsic, Boolean, Int64, (Type *)nullptr); + + EndCf = M.getOrInsertFunction( + EndCfIntrinsic, Void, Int64, (Type *)nullptr); + + return false; +} + +/// \brief Is BB the last block saved on the stack ? +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { + return !Stack.empty() && Stack.back().first == BB; +} + +/// \brief Pop the last saved value from the control flow stack +Value *SIAnnotateControlFlow::popSaved() { + return Stack.pop_back_val().second; +} + +/// \brief Push a BB and saved value to the control flow stack +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { + Stack.push_back(std::make_pair(BB, Saved)); +} + +/// \brief Can the condition represented by this PHI node treated like +/// an "Else" block? 
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) { + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + if (Phi->getIncomingBlock(i) == IDom) { + + if (Phi->getIncomingValue(i) != BoolTrue) + return false; + + } else { + if (Phi->getIncomingValue(i) != BoolFalse) + return false; + + } + } + return true; +} + +// \brief Erase "Phi" if it is not used any more +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + if (!Phi->hasNUsesOrMore(1)) + Phi->eraseFromParent(); +} + +/// \brief Open a new "If" block +void SIAnnotateControlFlow::openIf(BranchInst *Term) { + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Close the last "If" block and open a new "Else" block +void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + Value *Ret = CallInst::Create(Else, popSaved(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Recursively handle the condition leading to a loop +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, + llvm::Loop *L) { + + // Only search through PHI nodes which are inside the loop. If we try this + // with PHI nodes that are outside of the loop, we end up inserting new PHI + // nodes outside of the loop which depend on values defined inside the loop. + // This will break the module with + // 'Instruction does not dominate all users!' errors. + PHINode *Phi = nullptr; + if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { + + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; + + // Handle all non-constant incoming values first + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa<ConstantInt>(Incoming)) { + NewPhi->addIncoming(Broken, From); + continue; + } + + Phi->setIncomingValue(i, BoolFalse); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L); + NewPhi->addIncoming(PhiArg, From); + } + + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); + + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + + Value *Incoming = Phi->getIncomingValue(i); + if (Incoming != BoolTrue) + continue; + + BasicBlock *From = Phi->getIncomingBlock(i); + if (From == IDom) { + CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); + if (OldEnd && OldEnd->getCalledFunction() == EndCf) { + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); + continue; + } + } + TerminatorInst *Insert = From->getTerminator(); + Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); + NewPhi->setIncomingValue(i, PhiArg); + } + eraseIfUnused(Phi); + return Ret; + + } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { + BasicBlock *Parent = Inst->getParent(); + Instruction *Insert; + if (L->contains(Inst)) { + Insert = Parent->getTerminator(); + } else { + Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); + } + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); + + } else { + llvm_unreachable("Unhandled loop condition!"); + } + 
return 0; +} + +/// \brief Handle a back edge (loop) +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + BasicBlock *BB = Term->getParent(); + llvm::Loop *L = LI->getLoopFor(BB); + BasicBlock *Target = Term->getSuccessor(1); + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + + Value *Cond = Term->getCondition(); + Term->setCondition(BoolTrue); + Value *Arg = handleLoopCondition(Cond, Broken, L); + + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); + PI != PE; ++PI) { + + Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); + } + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + push(Term->getSuccessor(0), Arg); +}/// \brief Close the last opened control flow +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + llvm::Loop *L = LI->getLoopFor(BB); + + if (L && L->getHeader() == BB) { + // We can't insert an EndCF call into a loop header, because it will + // get executed on every iteration of the loop, when it should be + // executed only once before the loop. + SmallVector <BasicBlock*, 8> Latches; + L->getLoopLatches(Latches); + + std::vector<BasicBlock*> Preds; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) + Preds.push_back(*PI); + } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); + } + + CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); +} + +/// \brief Annotate the control flow with intrinsics so the backend can +/// recognize if/then/else and loops. +bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), + E = df_end(&F.getEntryBlock()); I != E; ++I) { + + BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator()); + + if (!Term || Term->isUnconditional()) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + continue; + } + + if (I.nodeVisited(Term->getSuccessor(1))) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + handleLoop(Term); + continue; + } + + if (isTopOfStack(*I)) { + PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); + if (Phi && Phi->getParent() == *I && isElse(Phi)) { + insertElse(Term); + eraseIfUnused(Phi); + continue; + } + closeControlFlow(*I); + } + openIf(Term); + } + + assert(Stack.empty()); + return true; +} + +/// \brief Create the annotation pass +FunctionPass *llvm::createSIAnnotateControlFlowPass() { + return new SIAnnotateControlFlow(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h new file mode 100644 index 0000000..aa1e352 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -0,0 +1,198 @@ +//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCInstrDesc.h" + +#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H +#define LLVM_LIB_TARGET_R600_SIDEFINES_H + +namespace SIInstrFlags { +// This needs to be kept in sync with the field bits in InstSI. 
+enum { + SALU = 1 << 3, + VALU = 1 << 4, + + SOP1 = 1 << 5, + SOP2 = 1 << 6, + SOPC = 1 << 7, + SOPK = 1 << 8, + SOPP = 1 << 9, + + VOP1 = 1 << 10, + VOP2 = 1 << 11, + VOP3 = 1 << 12, + VOPC = 1 << 13, + + MUBUF = 1 << 14, + MTBUF = 1 << 15, + SMRD = 1 << 16, + DS = 1 << 17, + MIMG = 1 << 18, + FLAT = 1 << 19, + WQM = 1 << 20, + VGPRSpill = 1 << 21, + VOPAsmPrefer32Bit = 1 << 22 +}; +} + +namespace llvm { +namespace AMDGPU { + enum OperandType { + /// Operand with register or 32-bit immediate + OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + /// Operand with register or inline constant + OPERAND_REG_INLINE_C + }; +} +} + +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 + }; + + // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. + // The result is true if any of these tests are true. + enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; +} + +namespace SISrcMods { + enum { + NEG = 1 << 0, + ABS = 1 << 1 + }; +} + +namespace SIOutMods { + enum { + NONE = 0, + MUL2 = 1, + MUL4 = 2, + DIV2 = 3 + }; +} + +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C +#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) + +#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B84C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B84C_SCRATCH_EN 0xFFFFFFFE +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B84C_USER_SGPR 0xFFFFFFC1 +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1) +#define C_00B84C_TGID_X_EN 0xFFFFFF7F +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define G_00B84C_TGID_Y_EN(x) (((x) >> 8) & 0x1) +#define C_00B84C_TGID_Y_EN 0xFFFFFEFF +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define G_00B84C_TGID_Z_EN(x) (((x) >> 9) & 0x1) +#define C_00B84C_TGID_Z_EN 0xFFFFFDFF +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define G_00B84C_TG_SIZE_EN(x) (((x) >> 10) & 0x1) +#define C_00B84C_TG_SIZE_EN 0xFFFFFBFF +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) +#define G_00B84C_TIDIG_COMP_CNT(x) (((x) >> 11) & 0x03) +#define C_00B84C_TIDIG_COMP_CNT 0xFFFFE7FF +/* CIK */ +#define S_00B84C_EXCP_EN_MSB(x) (((x) & 0x03) << 13) +#define G_00B84C_EXCP_EN_MSB(x) (((x) >> 13) & 0x03) +#define C_00B84C_EXCP_EN_MSB 0xFFFF9FFF +/* */ +#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) +#define G_00B84C_LDS_SIZE(x) (((x) >> 15) & 0x1FF) +#define C_00B84C_LDS_SIZE 0xFF007FFF +#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24) +#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) +#define C_00B84C_EXCP_EN + +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC +#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define 
S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double +// precision. +#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) + +#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 +#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) + + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp new file mode 100644 index 0000000..636750d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -0,0 +1,90 @@ +//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Spilling of EXEC masks used for control flow messes up control flow +/// lowering, so mark all live intervals associated with CF instructions as +/// non-spillable. 
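A short sketch of the failure mode being avoided, in pseudo machine IR; the s_or_b64 expansion is an assumption about the later SILowerControlFlow pass, which is not part of this file:

  %saved = SI_IF %cond, <endif>   ; %saved holds the exec mask of the masked-off lanes
  ...
  SI_END_CF %saved                ; later lowered to roughly: s_or_b64 exec, exec, %saved

The expansion needs the saved mask to still be in an SGPR pair at that point, so the pass below simply walks every SI_IF, SI_ELSE, SI_*_BREAK and SI_END_CF and calls markNotSpillable() on the live interval of the instruction's first register operand.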
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-cf-live-intervals" + +namespace { + +class SIFixControlFlowLiveIntervals : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix CF Live Intervals"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) + +char SIFixControlFlowLiveIntervals::ID = 0; + +char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; + +FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { + return new SIFixControlFlowLiveIntervals(); +} + +bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + case AMDGPU::SI_END_CF: { + unsigned Reg = MI.getOperand(0).getReg(); + LIS->getInterval(Reg).markNotSpillable(); + break; + } + default: + break; + } + } + } + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp new file mode 100644 index 0000000..f59d994 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -0,0 +1,373 @@ +//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Copies from VGPR to SGPR registers are illegal and the register coalescer +/// will sometimes generate these illegal copies in situations like this: +/// +/// Register Class <vsrc> is the union of <vgpr> and <sgpr> +/// +/// BB0: +/// %vreg0 <sgpr> = SCALAR_INST +/// %vreg1 <vsrc> = COPY %vreg0 <sgpr> +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 <vgpr> = VECTOR_INST +/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> +/// BB2: +/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1> +/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc> +/// +/// +/// The coalescer will begin at BB0 and eliminate its copy, then the resulting +/// code will look like this: +/// +/// BB0: +/// %vreg0 <sgpr> = SCALAR_INST +/// ... 
+/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 <vgpr> = VECTOR_INST +/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> +/// BB2: +/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1> +/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> +/// +/// Now that the result of the PHI instruction is an SGPR, the register +/// allocator is now forced to constrain the register class of %vreg3 to +/// <sgpr> so we end up with final code like this: +/// +/// BB0: +/// %vreg0 <sgpr> = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 <vgpr> = VECTOR_INST +/// %vreg3 <sgpr> = COPY %vreg2 <vgpr> +/// BB2: +/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1> +/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> +/// +/// Now this code contains an illegal copy from a VGPR to an SGPR. +/// +/// In order to avoid this problem, this pass searches for PHI instructions +/// which define a <vsrc> register and constrains its definition class to +/// <vgpr> if the user of the PHI's definition register is a vector instruction. +/// If the PHI's definition class is constrained to <vgpr> then the coalescer +/// will be unable to perform the COPY removal from the above example which +/// ultimately led to the creation of an illegal COPY. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "sgpr-copies" + +namespace { + +class SIFixSGPRCopies : public MachineFunctionPass { +public: + static char ID; + + SIFixSGPRCopies() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR copies"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + +char SIFixSGPRCopies::ID = 0; + +char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; + +FunctionPass *llvm::createSIFixSGPRCopiesPass() { + return new SIFixSGPRCopies(); +} + +static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + continue; + + if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) + return true; + } + return false; +} + +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getCopyRegClasses(const MachineInstr &Copy, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + + const TargetRegisterClass *SrcRC = + TargetRegisterInfo::isVirtualRegister(SrcReg) ? + MRI.getRegClass(SrcReg) : + TRI.getPhysRegClass(SrcReg); + + // We don't really care about the subregister here. 
+ // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); + + const TargetRegisterClass *DstRC = + TargetRegisterInfo::isVirtualRegister(DstReg) ? + MRI.getRegClass(DstReg) : + TRI.getPhysRegClass(DstReg); + + return std::make_pair(SrcRC, DstRC); +} + +static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); +} + +static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); +} + +// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. +// +// SGPRx = ... +// SGPRy = REG_SEQUENCE SGPRx, sub0 ... +// VGPRz = COPY SGPRy +// +// ==> +// +// VGPRx = COPY SGPRx +// VGPRz = REG_SEQUENCE VGPRx, sub0 +// +// This exposes immediate folding opportunities when materializing 64-bit +// immediates. +static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII, + MachineRegisterInfo &MRI) { + assert(MI.isRegSequence()); + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) + return false; + + if (!MRI.hasOneUse(DstReg)) + return false; + + MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); + if (!CopyUse.isCopy()) + return false; + + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); + + if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + return false; + + // TODO: Could have multiple extracts? + unsigned SubReg = CopyUse.getOperand(1).getSubReg(); + if (SubReg != AMDGPU::NoSubRegister) + return false; + + MRI.setRegClass(DstReg, DstRC); + + // SGPRx = ... + // SGPRy = REG_SEQUENCE SGPRx, sub0 ... + // VGPRz = COPY SGPRy + + // => + // VGPRx = COPY SGPRx + // VGPRz = REG_SEQUENCE VGPRx, sub0 + + MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + unsigned SrcReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + assert(TRI->isSGPRClass(SrcRC) && + "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); + + SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); + + unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) + .addOperand(MI.getOperand(I)); + + MI.getOperand(I).setReg(TmpReg); + } + + CopyUse.eraseFromParent(); + return true; +} + +bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + SmallVector<MachineInstr *, 16> Worklist; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + + switch (MI.getOpcode()) { + default: + continue; + case AMDGPU::COPY: { + // If the destination register is a physical register there isn't really + // much we can do to fix this. 
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) + continue; + + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + TII->moveToVALU(MI); + } + + break; + } + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + unsigned Reg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) + break; + + // If a PHI node defines an SGPR and any of its operands are VGPRs, + // then we need to move it to the VALU. + // + // Also, if a PHI node defines an SGPR and has all SGPR operands + // we must move it to the VALU, because the SGPR operands will + // all end up being assigned the same register, which means + // there is a potential for a conflict if different threads take + // different control flow paths. + // + // For Example: + // + // sgpr0 = def; + // ... + // sgpr1 = def; + // ... + // sgpr2 = PHI sgpr0, sgpr1 + // use sgpr2; + // + // Will Become: + // + // sgpr2 = def; + // ... + // sgpr2 = def; + // ... + // use sgpr2 + // + // FIXME: This is OK if the branching decision is made based on an + // SGPR value. + bool SGPRBranch = false; + + // The one exception to this rule is when one of the operands + // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK + // instruction. In this case, there we know the program will + // never enter the second block (the loop) without entering + // the first block (where the condition is computed), so there + // is no chance for values to be over-written. + + bool HasBreakDef = false; + for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { + unsigned Reg = MI.getOperand(i).getReg(); + if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { + TII->moveToVALU(MI); + break; + } + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch(DefInstr->getOpcode()) { + + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + // If we see a PHI instruction that defines an SGPR, then that PHI + // instruction has already been considered and should have + // a *_BREAK as an operand. + case AMDGPU::PHI: + HasBreakDef = true; + break; + } + } + + if (!SGPRBranch && !HasBreakDef) + TII->moveToVALU(MI); + break; + } + case AMDGPU::REG_SEQUENCE: { + if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || + !hasVGPROperands(MI, TRI)) { + foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); + continue; + } + + DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); + + TII->moveToVALU(MI); + break; + } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); + TII->moveToVALU(MI); + } + break; + } + } + } + } + + return true; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp new file mode 100644 index 0000000..8bda283 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,219 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file SALU instructions ignore the execution mask, so we need to modify the +/// live ranges of the registers they define in some cases. +/// +/// The main case we need to handle is when a def is used in one side of a +/// branch and not another. For example: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// +/// Here we need the register allocator to avoid assigning any of the defs +/// inside of the IF to the same register as %def. In traditional live +/// interval analysis %def is not live inside the IF branch, however, since +/// SALU instructions inside of IF will be executed even if the branch is not +/// taken, there is the chance that one of the instructions will overwrite the +/// value of %def, so the use in ELSE will see the wrong value. +/// +/// The strategy we use for solving this is to add an extra use after the ENDIF: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// %use +/// +/// Adding this use will make the def live throughout the IF branch, which is +/// what we want. + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-sgpr-live-ranges" + +namespace { + +class SIFixSGPRLiveRanges : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR live ranges"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveVariables>(); + AU.addPreserved<LiveVariables>(); + + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); + AU.setPreservesCFG(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
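The check this pass performs at each structured branch can be stated compactly. Below is a minimal standalone model of that decision; Block, needsArtificialUse and the liveness sets are simplified stand-ins for LiveVariables and MachineBasicBlock, so this is a sketch of the idea rather than the pass itself.

#include <iostream>
#include <set>

struct Block {
  std::set<unsigned> LiveIn; // virtual registers live into the block
};

// An SGPR def needs the artificial use at the nearest common post-dominator
// (the block after ENDIF) exactly when it is live into one successor of the
// branch but not the other: the SALU instructions on the other side still
// execute and may clobber the value before the real use is reached.
bool needsArtificialUse(unsigned Reg, const Block &SuccA, const Block &SuccB) {
  bool LiveInToA = SuccA.LiveIn.count(Reg) != 0;
  bool LiveInToB = SuccB.LiveIn.count(Reg) != 0;
  return LiveInToA != LiveInToB;
}

int main() {
  Block ThenBB;            // %def is not used on this path
  Block ElseBB{{42}};      // ...but %use of vreg 42 is on this one
  std::cout << needsArtificialUse(42, ThenBB, ElseBB) << '\n'; // 1: extend the range
}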
+ +INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) + +char SIFixSGPRLiveRanges::ID = 0; + +char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; + +FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { + return new SIFixSGPRLiveRanges(); +} + +bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + bool MadeChange = false; + + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + SmallVector<unsigned, 16> SGPRLiveRanges; + + LiveVariables *LV = &getAnalysis<LiveVariables>(); + MachineBasicBlock *Entry = &MF.front(); + + // Use a depth first order so that in SSA, we encounter all defs before + // uses. Once the defs of the block have been found, attempt to insert + // SGPR_USE instructions in successor blocks if required. + for (MachineBasicBlock *MBB : depth_first(Entry)) { + for (const MachineInstr &MI : *MBB) { + for (const MachineOperand &MO : MI.defs()) { + // We should never see a live out def of a physical register, so we also + // do not need to worry about implicit_defs(). + unsigned Def = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def)) { + if (TRI->isSGPRClass(MRI.getRegClass(Def))) { + // Only consider defs that are live outs. We don't care about def / + // use within the same block. + + // LiveVariables does not consider registers that are only used in a + // phi in a sucessor block as live out, unlike LiveIntervals. + // + // This is OK because SIFixSGPRCopies replaced any SGPR phis with + // VGPRs. + if (LV->isLiveOut(Def, *MBB)) + SGPRLiveRanges.push_back(Def); + } + } + } + } + + if (MBB->succ_size() < 2) + continue; + + // We have structured control flow, so the number of successors should be + // two. + assert(MBB->succ_size() == 2); + MachineBasicBlock *SuccA = *MBB->succ_begin(); + MachineBasicBlock *SuccB = *(++MBB->succ_begin()); + MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); + + if (!NCD) + continue; + + MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); + + if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { + assert(NCD->succ_size() == 2); + // We want to make sure we insert the Use after the ENDIF, not after + // the ELSE. + NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), + *(++NCD->succ_begin())); + } + + for (unsigned Reg : SGPRLiveRanges) { + // FIXME: We could be smarter here. If the register is Live-In to one + // block, but the other doesn't have any SGPR defs, then there won't be a + // conflict. Also, if the branch condition is uniform then there will be + // no conflict. + bool LiveInToA = LV->isLiveIn(Reg, *SuccA); + bool LiveInToB = LV->isLiveIn(Reg, *SuccB); + + if (!LiveInToA && !LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into neither successor\n"); + continue; + } + + if (LiveInToA && LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into both successors\n"); + continue; + } + + // This interval is live in to one successor, but not the other, so + // we need to update its range so it is live in to both. 
+ DEBUG(dbgs() << "Possible SGPR conflict detected for " + << PrintReg(Reg, TRI, 0) + << " BB#" << SuccA->getNumber() + << ", BB#" << SuccB->getNumber() + << " with NCD = BB#" << NCD->getNumber() << '\n'); + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Not expecting to extend live range of physreg"); + + // FIXME: Need to figure out how to update LiveRange here so this pass + // will be able to preserve LiveInterval analysis. + MachineInstr *NCDSGPRUse = + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + + MadeChange = true; + LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); + + DEBUG(NCDSGPRUse->dump()); + } + } + + return MadeChange; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp new file mode 100644 index 0000000..6230d1e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -0,0 +1,389 @@ +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-fold-operands" +using namespace llvm; + +namespace { + +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fold Operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +struct FoldCandidate { + MachineInstr *UseMI; + unsigned UseOpNo; + MachineOperand *OpToFold; + uint64_t ImmToFold; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), UseOpNo(OpNo) { + + if (FoldOp->isImm()) { + OpToFold = nullptr; + ImmToFold = FoldOp->getImm(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isImm() const { + return !OpToFold; + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) + +char SIFoldOperands::ID = 0; + +char &llvm::SIFoldOperandsID = SIFoldOperands::ID; + +FunctionPass *llvm::createSIFoldOperandsPass() { + return new SIFoldOperands(); +} + +static bool isSafeToFold(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + +static bool updateOperand(FoldCandidate &Fold, + const TargetRegisterInfo &TRI) { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } + + MachineOperand *New = Fold.OpToFold; + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && + TargetRegisterInfo::isVirtualRegister(New->getReg())) { + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + return true; + } + + // FIXME: Handle physical registers. + + return false; +} + +static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, + const MachineInstr *MI) { + for (auto Candidate : FoldList) { + if (Candidate.UseMI == MI) + return true; + } + return false; +} + +static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *OpToFold, + const SIInstrInfo *TII) { + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + + // Special case for v_mac_f32_e64 if we are trying to fold into src2 + unsigned Opc = MI->getOpcode(); + if (Opc == AMDGPU::V_MAC_F32_e64 && + (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { + // Check if changing this to a v_mad_f32 instruction will allow us to + // fold the operand. + MI->setDesc(TII->get(AMDGPU::V_MAD_F32)); + bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); + if (FoldAsMAD) { + MI->untieRegOperand(OpNo); + return true; + } + MI->setDesc(TII->get(Opc)); + } + + // If we are already folding into another operand of MI, then + // we can't commute the instruction, otherwise we risk making the + // other fold illegal. + if (isUseMIInFoldList(FoldList, MI)) + return false; + + // Operand is not legal, so try to commute the instruction to + // see if this makes it possible to fold. + unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; + bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + + if (CanCommute) { + if (CommuteIdx0 == OpNo) + OpNo = CommuteIdx1; + else if (CommuteIdx1 == OpNo) + OpNo = CommuteIdx0; + } + + // One of operands might be an Imm operand, and OpNo may refer to it after + // the call of commuteInstruction() below. Such situations are avoided + // here explicitly as OpNo must be a register operand to be a candidate + // for memory folding. 
+ if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || + !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || + !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) + return false; + + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; +} + +static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, + unsigned UseOpIdx, + std::vector<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace, + const SIInstrInfo *TII, const SIRegisterInfo &TRI, + MachineRegisterInfo &MRI) { + const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + return; + } + + bool FoldingImm = OpToFold.isImm(); + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); + const TargetRegisterClass *FoldRC = + TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + + // Split 64-bit constants into 32-bits for folding. + if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + return; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI.use_begin(RegSeqDstReg), + RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + return; + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; + } + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. 
+ return; +} + +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (!isSafeToFold(MI.getOpcode())) + continue; + + unsigned OpSize = TII->getOpSize(MI, 1); + MachineOperand &OpToFold = MI.getOperand(1); + bool FoldingImm = OpToFold.isImm(); + + // FIXME: We could also be folding things like FrameIndexes and + // TargetIndexes. + if (!FoldingImm && !OpToFold.isReg()) + continue; + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && + !MRI.hasOneUse(MI.getOperand(0).getReg())) + continue; + + if (OpToFold.isReg() && + !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) + continue; + + // Prevent folding operands backwards in the function. For example, + // the COPY opcode must not be replaced by 1 in this example: + // + // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 + // ... + // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> + MachineOperand &Dst = MI.getOperand(0); + if (Dst.isReg() && + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + + std::vector<FoldCandidate> FoldList; + for (MachineRegisterInfo::use_iterator + Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); + Use != E; ++Use) { + + MachineInstr *UseMI = Use->getParent(); + + foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(MF); + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, TRI)) { + // Clear kill flags. + if (!Fold.isImm()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI.clearKillFlags(Fold.OpToFold->getReg()); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); + } + } + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp new file mode 100644 index 0000000..7d20509 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -0,0 +1,245 @@ +//===----------------------- SIFrameLowering.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
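One more note on the fold driver before the frame-lowering file: its basic-block loop deliberately captures Next = std::next(I) before inspecting the instruction, because folding can erase or rewrite *I. The following is a generic illustration of that save-the-successor pattern, with std::list standing in for the machine instruction list.

#include <iostream>
#include <iterator>
#include <list>

int main() {
  std::list<int> Insts = {1, 2, 3, 4};
  for (auto I = Insts.begin(), Next = I; I != Insts.end(); I = Next) {
    Next = std::next(I);   // grab the successor before *I can be invalidated
    if (*I % 2 == 0)
      Insts.erase(I);      // safe: only the erased iterator is invalidated
  }
  for (int V : Insts)
    std::cout << V << ' '; // prints: 1 3
  std::cout << '\n';
}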
+// +//==-----------------------------------------------------------------------===// + +#include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" + +using namespace llvm; + + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef<MCPhysReg> getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef<MCPhysReg> getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + if (!ST.hasSGPRInitBug()) { + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. 
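The register scan that follows can look opaque, so here is the arithmetic behind it in isolation: the preloaded user SGPRs occupy the leading 32-bit registers, so the search starts at the first 128-bit tuple past them and stops short of the end to stay clear of VCC. All numbers below are illustrative, and the plain vector stands in for the ArrayRef returned by getAllSGPR128().

#include <iostream>
#include <string>
#include <vector>

int main() {
  const unsigned NumTuples = 24;         // illustrative count of SReg_128 tuples
  const unsigned NumPreloadedSGPRs = 8;  // e.g. rsrc (4) + dispatch ptr (2) + kernarg ptr (2)

  std::vector<std::string> Tuples;
  for (unsigned i = 0; i != NumTuples; ++i)
    Tuples.push_back("SGPR" + std::to_string(4 * i) + "_" + std::to_string(4 * i + 3));

  // slice(NumPreloaded): skip the NumPreloadedSGPRs / 4 tuples overlapping the
  // preloaded arguments; drop_back(2): keep away from the VCC end of the file.
  unsigned First = NumPreloadedSGPRs / 4;
  unsigned Last = NumTuples - 2;

  for (unsigned i = First; i != Last; ++i) {
    // The real code additionally requires !MRI.isPhysRegUsed(Reg); the example
    // simply reports the first candidate in the range.
    std::cout << "first candidate tuple: " << Tuples[i] << '\n';
    break;
  }
}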
+ + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. 
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + +void SIFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + + assert((RS || !MayNeedScavengingEmergencySlot) && + "RegScavenger required if spilling"); + + if (MayNeedScavengingEmergencySlot) { + int ScavengeFI = MFI->CreateSpillStackObject( + AMDGPU::SGPR_32RegClass.getSize(), + AMDGPU::SGPR_32RegClass.getAlignment()); + RS->addScavengingFrameIndex(ScavengeFI); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h new file mode 100644 index 0000000..a9152fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -0,0 +1,34 @@ +//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class SIFrameLowering final : public AMDGPUFrameLowering { +public: + SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + ~SIFrameLowering() override {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS = nullptr) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp new file mode 100644 index 0000000..5448675 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -0,0 +1,2643 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for SI +// +//===----------------------------------------------------------------------===// + +#ifdef _MSC_VER +// Provide M_PI. 
+#define _USE_MATH_DEFINES +#include <cmath> +#endif + +#include "SIISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); + + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + setOperationAction(ISD::ADD, MVT::i32, Legal); + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Promote); + AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + 
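Most of the constructor above is table setup, but the Promote entries deserve a gloss: promoting SELECT on f64 to i64 means the select is carried out on the 64-bit bit pattern and the chosen bits are reinterpreted, which is safe because a select copies one operand unmodified. A standalone illustration of what that amounts to, in ordinary C++ rather than SelectionDAG nodes:

#include <cstdint>
#include <cstring>
#include <iostream>

// SELECT f64 promoted to i64: pick between the raw bit patterns, then view
// the chosen bits as a double again. No floating-point operation is involved,
// so the value is reproduced exactly (NaN payloads and signed zeros included).
double selectViaI64(bool Cond, double A, double B) {
  uint64_t IA, IB, R;
  std::memcpy(&IA, &A, sizeof IA);
  std::memcpy(&IB, &B, sizeof IB);
  R = Cond ? IA : IB;
  double Out;
  std::memcpy(&Out, &R, sizeof Out);
  return Out;
}

int main() {
  std::cout << selectViaI64(true, 1.5, 2.5) << ' '
            << selectViaI64(false, 1.5, 2.5) << '\n'; // 1.5 2.5
}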
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + + setOperationAction(ISD::LOAD, MVT::i1, Custom); + + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i1, Promote); + + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, 
MVT::v2f32, Expand); + + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch(Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: + break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; + default: + setOperationAction(Op, VT, Expand); + break; + } + } + } + + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + } + + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::UMAX); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::UINT_TO_FP); + + // All memory operations. Some folding on the pointer operand is done to help + // matching the constant offsets in the addressing modes. + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + setTargetDAGCombine(ISD::ATOMIC_SWAP); + setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); + setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); + setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); + + setSchedulingPreference(Sched::RegPressure); +} + +//===----------------------------------------------------------------------===// +// TargetLowering queries +//===----------------------------------------------------------------------===// + +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. 
Say no + // shuffles are legal in order to prefer scalarizing some vector operations. + return false; +} + +bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); +} + +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } +} + +bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Assume the we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM); + } + + return isLegalMUBUFAddressingMode(AM); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // If the offset isn't a multiple of 4, it probably isn't going to be + // correctly aligned. + if (AM.BaseOffs % 4 != 0) + return isLegalMUBUFAddressingMode(AM); + + // There are no SMRD extloads, so if we have to do a small type access we + // will use a MUBUF load. + // FIXME?: We also need to do this if unaligned, but we don't know the + // alignment here. + if (DL.getTypeStoreSize(Ty) < 4) + return isLegalMUBUFAddressingMode(AM); + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // SMRD instructions have an 8-bit, dword offset on SI. + if (!isUInt<8>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + // On CI+, this can also be a 32-bit literal constant offset. If it fits + // in 8-bits, it can use a smaller encoding. + if (!isUInt<32>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // On VI, these use the SMEM format and the offset is 20-bit in bytes. 
+ if (!isUInt<20>(AM.BaseOffs)) + return false; + } else + llvm_unreachable("unhandled generation"); + + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. + return true; + + if (AM.Scale == 1 && AM.HasBaseReg) + return true; + + return false; + } + + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + return isLegalMUBUFAddressingMode(AM); + + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // Basic, single offset DS instructions allow a 16-bit unsigned immediate + // field. + // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have + // an 8-bit dword offset but we don't know the alignment here. + if (!isUInt<16>(AM.BaseOffs)) + return false; + + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. + return true; + + if (AM.Scale == 1 && AM.HasBaseReg) + return true; + + return false; + } + case AMDGPUAS::FLAT_ADDRESS: + return isLegalFlatAddressingMode(AM); + + default: + llvm_unreachable("unhandled address space"); + } +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + if (!VT.isSimple() || VT == MVT::Other) + return false; + + // TODO - CI+ supports unaligned memory accesses, but this requires driver + // support. + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; + } + + // Smaller than dword value must be aligned. + // FIXME: This should be allowed on CI+ + if (VT.bitsLT(MVT::i32)) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. + // This applies to private, global, and constant memory. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit accesses. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Use the default. + return MVT::Other; +} + +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. 
+ // Sometimes LDS instructions have constant pointers + if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || + isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + return TII->isInlineConstant(Imm); +} + +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + SDLoc SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout &DL = DAG.getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); + SDValue PtrOffset = DAG.getUNDEF(PtrVT); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + + unsigned Align = DL.getABITypeAlignment(Ty); + + ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) + ExtTy = ISD::EXTLOAD; + + return DAG.getLoad(ISD::UNINDEXED, ExtTy, + VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment +} + +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } + + // FIXME: We currently assume all calling conventions are kernels. 
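LowerParameter above boils down to an extending load from the kernarg segment pointer at a fixed byte offset. The host-side sketch below models only that address computation; loadKernArg is a made-up name, and the 36-byte offset in the example mirrors the comment further down about the first 36 bytes of the input buffer, while the real lowering emits a CONSTANT_ADDRESS load with the alignment and extension kind chosen there.

#include <cstdint>
#include <cstring>
#include <iostream>

// Read a kernel argument of type T from the kernarg segment at ByteOffset.
// Ordinary pointer arithmetic stands in for the base-plus-offset load node.
template <typename T>
T loadKernArg(const uint8_t *KernArgSegment, unsigned ByteOffset) {
  T Value;
  std::memcpy(&Value, KernArgSegment + ByteOffset, sizeof(T));
  return Value;
}

int main() {
  uint8_t Segment[64] = {};
  uint32_t FirstArg = 1234;
  std::memcpy(Segment + 36, &FirstArg, sizeof FirstArg); // placed after the 36-byte header
  std::cout << loadKernArg<uint32_t>(Segment, 36) << '\n'; // 1234
}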
+ + SmallVector<ISD::InputArg, 16> Splits; + BitVector Skipped(Ins.size()); + + for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + + // First check if it's a PS input addr + if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + !Arg.Flags.isByVal() && PSInputNum <= 15) { + + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { + // We can safely skip PS inputs + Skipped.set(i); + ++PSInputNum; + continue; + } + + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; + } + + // Second split vertices into their elements + if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + + } else if (Info->getShaderType() != ShaderType::COMPUTE) { + Splits.push_back(Arg); + } + } + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. + if (Info->getShaderType() == ShaderType::PIXEL && + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; + } + + if (Info->getShaderType() == ShaderType::COMPUTE) { + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } + + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
+ if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + + AnalyzeFormalArguments(CCInfo, Splits); + + SmallVector<SDValue, 16> Chains; + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + + const ISD::InputArg &Arg = Ins[i]; + if (Skipped[i]) { + InVals.push_back(DAG.getUNDEF(Arg.VT)); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + MVT VT = VA.getLocVT(); + + if (VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = Splits[i].VT; + const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + + VA.getLocMemOffset(); + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, + Offset, Ins[i].Flags.isSExt()); + Chains.push_back(Arg.getValue(1)); + + auto *ParamTy = + dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. + Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + + InVals.push_back(Arg); + Info->ABIArgOffset = Offset + MemVT.getStoreSize(); + continue; + } + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + + if (VT == MVT::i64) { + // For now assume it is a pointer + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, + &AMDGPU::SReg_64RegClass); + Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); + SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); + InVals.push_back(Copy); + continue; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + + if (Arg.VT.isVector()) { + + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector<SDValue, 4> Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + + SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); + Regs.push_back(Copy); + } + + // Fill up the missing vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + Regs.append(NumElements, DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + continue; + } + + InVals.push_back(Val); + } + + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Chains.empty()) + return Chain; + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); +} + +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + Info->setIfReturnsVoid(Outs.size() == 0); + + SmallVector<ISD::OutputArg, 48> Splits; + SmallVector<SDValue, 48> SplitVals; + + // Split vectors into their elements. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 48> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector<SDValue, 48> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. 
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + + switch (MI->getOpcode()) { + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: + return BB; + } + return BB; +} + +bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { + // This currently forces unfolding various combinations of fsub into fma with + // free fneg'd operands. As long as we have fast FMA (controlled by + // isFMAFasterThanFMulAndFAdd), we should perform these. + + // When fma is quarter rate, for f64 where add / sub are at best half rate, + // most of these combines appear to be cycle neutral but save on instruction + // count / code size. + return true; +} + +EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, + EVT VT) const { + if (!VT.isVector()) { + return MVT::i1; + } + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); +} + +MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const { + return MVT::i32; +} + +// Answering this is somewhat tricky and depends on the specific device which +// have different rates for fma or all f64 operations. +// +// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other +// regardless of which device (although the number of cycles differs between +// devices), so it is always profitable for f64. +// +// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable +// only on full rate devices. Normally, we should prefer selecting v_mad_f32 +// which we can always do even without fused FP ops since it returns the same +// result as the separate operations and since it is always full +// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 +// however does not support denormals, so we do report fma as faster if we have +// a fast fma device and require denormals. +// +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + // This is as fast on some subtargets. However, we always have full rate f32 + // mad available which returns the same result as the separate operations + // which we should prefer over fma. We can't use this if we want to support + // denormals, so only report this in these cases. 
+ return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::FSIN: + case ISD::FCOS: + return LowerTrig(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::GlobalAddress: { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return LowerGlobalAddress(MFI, Op, DAG); + } + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + } + return SDValue(); +} + +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + + SDNode *Parent = Value.getNode(); + for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); + I != E; ++I) { + + if (I.getUse().get() != Value) + continue; + + if (I->getOpcode() == Opcode) + return *I; + } + return nullptr; +} + +SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + SDLoc SL(Op); + FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); + unsigned FrameIndex = FINode->getIndex(); + + // A FrameIndex node represents a 32-bit offset into scratch memory. If + // the high bit of a frame index offset were to be set, this would mean + // that it represented an offset of ~2GB * 64 = ~128GB from the start of the + // scratch buffer, with 64 being the number of threads per wave. + // + // If we know the machine uses less than 128GB of scratch, then we can + // mark the high bit of the FrameIndex node as known zero, + // which is important, because it means in most situations we can + // prove that values derived from FrameIndex nodes are non-negative. + // This enables us to take advantage of more addressing modes when + // accessing scratch buffers, since for scratch reads/writes, the register + // offset must always be positive.
+ + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); + if (Subtarget->enableHugeScratchBuffer()) + return TFI; + + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); +} + +/// This transforms the control flow intrinsics to get the branch destination as +/// last parameter, also switches branch target with BR if the need arise +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, + SelectionDAG &DAG) const { + + SDLoc DL(BRCOND); + + SDNode *Intr = BRCOND.getOperand(1).getNode(); + SDValue Target = BRCOND.getOperand(2); + SDNode *BR = nullptr; + + if (Intr->getOpcode() == ISD::SETCC) { + // As long as we negate the condition everything is fine + SDNode *SetCC = Intr; + assert(SetCC->getConstantOperandVal(1) == 1); + assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE); + Intr = SetCC->getOperand(0).getNode(); + + } else { + // Get the target from BR if we don't negate the condition + BR = findUser(BRCOND, ISD::BR); + Target = BR->getOperand(1); + } + + assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + + // Build the result and + ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); + + // operands of the new intrinsic call + SmallVector<SDValue, 4> Ops; + Ops.push_back(BRCOND.getOperand(0)); + Ops.append(Intr->op_begin() + 1, Intr->op_end()); + Ops.push_back(Target); + + // build the new intrinsic call + SDNode *Result = DAG.getNode( + Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, + DAG.getVTList(Res), Ops).getNode(); + + if (BR) { + // Give the branch instruction our target + SDValue Ops[] = { + BR->getOperand(0), + BRCOND.getOperand(2) + }; + SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); + DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); + BR = NewBR.getNode(); + } + + SDValue Chain = SDValue(Result, Result->getNumValues() - 1); + + // Copy the intrinsic results to registers + for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { + SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); + if (!CopyToReg) + continue; + + Chain = DAG.getCopyToReg( + Chain, DL, + CopyToReg->getOperand(1), + SDValue(Result, i - 1), + SDValue()); + + DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); + } + + // Remove the old intrinsic from the chain + DAG.ReplaceAllUsesOfValueWith( + SDValue(Intr, Intr->getNumValues() - 1), + Intr->getOperand(0)); + + return Chain; +} + +SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); + + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + SDLoc DL(GSD); + const GlobalValue *GV = GSD->getGlobal(); + MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); + + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); +} + +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + SDValue V) const { + // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, + // so we will end up with redundant moves to m0. + // + // We can't use S_MOV_B32, because there is no way to specify m0 as the + // destination register. + // + // We have to use them both. 
MachineCSE will combine all the S_MOV_B32 + // instructions and the register coalescer will eliminate the extra copies. + SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); + return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), + SDValue(M0, 0), SDValue()); // Glue + // A null SDValue creates + // a glue result. +} + +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + +SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + auto MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + + // TODO: Should this propagate fast-math-flags? + + switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + case Intrinsic::r600_read_local_size_x: + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); + case Intrinsic::r600_read_local_size_y: + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); + case Intrinsic::r600_read_local_size_z: + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); + case Intrinsic::AMDGPU_read_workdim: + // Really only 2 bits.
+ return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops[] = { + Op.getOperand(1), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
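+ // Lowered as fract(x) = x - floor(x); e.g. fract(2.75) = 0.75.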
+ return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), + DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); + case AMDGPUIntrinsic::SI_fs_constant: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(1), Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; + case AMDGPUIntrinsic::SI_fs_interp: { + SDValue IJ = Op.getOperand(4); + SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(0, DL, MVT::i32)); + SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(1, DL, MVT::i32)); + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, + DAG.getVTList(MVT::f32, MVT::Glue), + I, Op.getOperand(1), Op.getOperand(2), Glue); + Glue = SDValue(P1.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, + Op.getOperand(1), Op.getOperand(2), Glue); + } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } +} + +SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_sendmsg: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: + return SDValue(); + } +} + +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + + if (Op.getValueType().isVector()) { + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned NumElements = Op.getValueType().getVectorNumElements(); + assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + + switch (Load->getAddressSpace()) { 
+ default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requirements as global and private + // loads. + // + // Fall-through + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + + // v4 loads are supported for private and global memory. + if (NumElements <= 4) + break; + // fall-through + case AMDGPUAS::LOCAL_ADDRESS: + // If properly aligned, splitting may let us use ds_read_b64. + return SplitVectorLoad(Op, DAG); + } + } + + return AMDGPUTargetLowering::LowerLOAD(Op, DAG); +} + +SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, + const SDValue &Op, + SelectionDAG &DAG) const { + return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4)); +} + +SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i64) + return SDValue(); + + SDLoc DL(Op); + SDValue Cond = Op.getOperand(0); + + SDValue Zero = DAG.getConstant(0, DL, MVT::i32); + SDValue One = DAG.getConstant(1, DL, MVT::i32); + + SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); + + SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); + SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); + + SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); + + SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); + SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); + + SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); + + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); +} + +// Catch division cases where we can use shortcuts with rcp and rsq +// instructions. +SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getValueType(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { + if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && + CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation have a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + + // 1.0 / sqrt(x) -> rsq(x) + // + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + } + + if (Unsafe) { + // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y) + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); + } + + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + SDValue FastLowered = LowerFastFDIV(Op, DAG); + if (FastLowered.getNode()) + return FastLowered; + + // This uses v_rcp_f32 which does not handle denormals. Let this hit a + // selection error for now rather than do something incorrect. + if (Subtarget->hasFP32Denormals()) + return SDValue(); + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + // TODO: Should this propagate fast-math-flags? + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} + +SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getTarget().Options.UnsafeFPMath) + return LowerFastFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); + + SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); + + SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); + + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); + + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); + + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); + + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); + + SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); + + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); + + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, + NegDivScale0, Mul, DivScale1); + + SDValue Scale; + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. + + const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); + + // Figure out if the scale to use for div_fmas. 
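+ // The high dword of each div_scale result is compared against the high + // dword of the corresponding original operand; xor-ing the two compares + // reconstructs, in effect, the scale condition that DIV_SCALE would + // otherwise report in its second result.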
+ SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT VT = Store->getMemoryVT(); + + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return ScalarizeVectorStore(Op, DAG); + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) + return Ret; + + if (VT.isVector() && VT.getVectorNumElements() >= 8) + return SplitVectorStore(Op, DAG); + + if (VT == MVT::i1) + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + + return SDValue(); +} + +SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + // TODO: Should this propagate fast-math-flags? + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.5/M_PI, DL, + VT))); + + switch (Op.getOpcode()) { + case ISD::FCOS: + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); + case ISD::FSIN: + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); + default: + llvm_unreachable("Wrong trig opcode"); + } +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. 
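+ // + // For example, a (uint_to_fp x) where x is an i32 whose upper 24 bits are + // known zero becomes a single CVT_F32_UBYTE0 of x.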
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + DCI.AddToWorklist(Cvt.getNode()); + return Cvt; + } + } + + // We are primarily trying to catch operations on illegal vector types + // before they are expanded. + // For scalars, we can use the more flexible method of checking masked bits + // after legalization. + if (!DCI.isBeforeLegalize() || + !SrcVT.isVector() || + SrcVT.getVectorElementType() != MVT::i8) { + return SDValue(); + } + + assert(DCI.isBeforeLegalize() && "Unexpected legal type"); + + // Weird sized vectors are a pain to handle, but we know 3 is really the same + // size as 4. + unsigned NElts = SrcVT.getVectorNumElements(); + if (!SrcVT.isSimple() && NElts != 3) + return SDValue(); + + // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to + // prevent a mess from expanding to v4i32 and repacking. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); + EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + LoadSDNode *Load = cast<LoadSDNode>(Src); + + unsigned AS = Load->getAddressSpace(); + unsigned Align = Load->getAlignment(); + Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); + + // Don't try to replace the load if we have to expand it due to alignment + // problems. Otherwise we will end up scalarizing the load, and trying to + // repack into the vector for no real reason. + if (Align < ABIAlignment && + !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { + return SDValue(); + } + + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, + Load->getChain(), + Load->getBasePtr(), + LoadVT, + Load->getMemOperand()); + + // Make sure successors of the original load stay after it by updating + // them to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + + SmallVector<SDValue, 4> Elts; + if (RegVT.isVector()) + DAG.ExtractVectorElements(NewLoad, Elts); + else + Elts.push_back(NewLoad); + + SmallVector<SDValue, 4> Ops; + + unsigned EltIdx = 0; + for (SDValue Elt : Elts) { + unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); + for (unsigned I = 0; I < ComponentsInElt; ++I) { + unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; + SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); + DCI.AddToWorklist(Cvt.getNode()); + Ops.push_back(Cvt); + } + + ++EltIdx; + } + + assert(Ops.size() == NElts); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); + } + + return SDValue(); +} + +/// \brief Return true if the given offset Size in bytes can be folded into +/// the immediate offsets of a memory instruction for the given address space. +static bool canFoldOffset(unsigned OffsetSize, unsigned AS, + const AMDGPUSubtarget &STI) { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions have a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI.
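+ // For example, a 512-byte offset (128 dwords) is encodable on both, while + // a 2000-byte offset only fits on VI, since 500 dwords does not fit in the + // 8-bit SI field.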
+ if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } +} + +// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) + +// This is a variant of +// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), +// +// The normal DAG combiner will do this, but only if the add has one use since +// that would increase the number of instructions. +// +// This prevents us from seeing a constant offset that can be folded into a +// memory instruction's addressing mode. If we know the resulting add offset of +// a pointer can be folded into an addressing offset, we can replace the pointer +// operand with the add of new constant offset. This eliminates one of the uses, +// and may allow the remaining use to also be simplified. +// +SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, + unsigned AddrSpace, + DAGCombinerInfo &DCI) const { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N0.getOpcode() != ISD::ADD) + return SDValue(); + + const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); + if (!CN1) + return SDValue(); + + const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!CAdd) + return SDValue(); + + // If the resulting offset is too large, we can't fold it into the addressing + // mode offset. + APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + + SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); + SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); + + return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); +} + +SDValue SITargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::SETCC && + RHS.getOpcode() == ISD::SETCC) { + ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); + + SDValue X = LHS.getOperand(0); + SDValue Y = RHS.getOperand(0); + if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) + return SDValue(); + + if (LCC == ISD::SETO) { + if (X != LHS.getOperand(1)) + return SDValue(); + + if (RCC == ISD::SETUNE) { + const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); + if (!C1 || !C1->isInfinity() || C1->isNegative()) + return SDValue(); + + const uint32_t Mask = SIInstrFlags::N_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::P_NORMAL; + + static_assert(((~(SIInstrFlags::S_NAN | + SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | + SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, + "mask not equal"); + + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + X, DAG.getConstant(Mask, DL, MVT::i32)); + } + } + } + 
+ return SDValue(); +} + +SDValue SITargetLowering::performOrCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. + static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + Src, DAG.getConstant(NewMask, DL, MVT::i32)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performClassCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue Mask = N->getOperand(1); + + // fp_class x, 0 -> false + if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { + if (CMask->isNullValue()) + return DAG.getConstant(0, SDLoc(N), MVT::i1); + } + + return SDValue(); +} + +static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return AMDGPUISD::FMAX3; + case ISD::SMAX: + return AMDGPUISD::SMAX3; + case ISD::UMAX: + return AMDGPUISD::UMAX3; + case ISD::FMINNUM: + return AMDGPUISD::FMIN3; + case ISD::SMIN: + return AMDGPUISD::SMIN3; + case ISD::UMIN: + return AMDGPUISD::UMIN3; + default: + llvm_unreachable("Not a min/max opcode"); + } +} + +SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Only do this if the inner op has one use since this will just increase + // register pressure for no benefit.
+ + // max(max(a, b), c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // max(a, max(b, c)) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performSetCCCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = LHS.getValueType(); + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Match isinf pattern + // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if (!CRHS) + return SDValue(); + + const APFloat &APF = CRHS->getValueAPF(); + if (APF.isInfinity() && !APF.isNegative()) { + unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(Mask, SL, MVT::i32)); + } + } + + return SDValue(); +} + +SDValue SITargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch (N->getOpcode()) { + default: + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::SETCC: + return performSetCCCombine(N, DCI); + case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMINNUM: + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: { + if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + N->getValueType(0) != MVT::f64 && + getTargetMachine().getOptLevel() > CodeGenOpt::None) + return performMin3Max3Combine(N, DCI); + break; + } + + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: { + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + + case ISD::UINT_TO_FP: { + return performUCharToFloatCombine(N, DCI); + } + case ISD::FADD: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + if (VT != MVT::f32) + break; + + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (Subtarget->hasFP32Denormals()) + break; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // These should really be instruction patterns, but writing patterns with + // source modifiers is a pain.
+ + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); + } + } + + return SDValue(); + } + case ISD::FSUB: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + + // Try to get the fneg to fold into the source modifier. This undoes generic + // DAG combines and folds them into the mad. + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); + } + } + + return SDValue(); + } + + break; + } + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + if (DCI.isBeforeLegalize()) + break; + + MemSDNode *MemNode = cast<MemSDNode>(N); + SDValue Ptr = MemNode->getBasePtr(); + + // TODO: We could also do this for multiplies. + unsigned AS = MemNode->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); + + NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; + return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); + } + } + break; + } + case ISD::AND: + return performAndCombine(N, DCI); + case ISD::OR: + return performOrCombine(N, DCI); + case AMDGPUISD::FP_CLASS: + return performClassCombine(N, DCI); + } + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); +} + +/// \brief Analyze the possible immediate value Op +/// +/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate, +/// and the immediate value if it's a literal immediate +int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { + if (TII->isInlineConstant(Node->getAPIntValue())) + return 0; + + uint64_t Val = Node->getZExtValue(); + return isUInt<32>(Val) ? Val : -1; + } + + if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { + if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) + return 0; + + if (Node->getValueType(0) == MVT::f32) + return FloatToBits(Node->getValueAPF().convertToFloat()); + + return -1; + } + + return -1; +} + +/// \brief Helper function for adjustWritemask +static unsigned SubIdx2Lane(unsigned Idx) { + switch (Idx) { + default: return 0; + case AMDGPU::sub0: return 0; + case AMDGPU::sub1: return 1; + case AMDGPU::sub2: return 2; + case AMDGPU::sub3: return 3; + } +} + +/// \brief Adjust the writemask of MIMG instructions +void SITargetLowering::adjustWritemask(MachineSDNode *&Node, + SelectionDAG &DAG) const { + SDNode *Users[4] = { }; + unsigned Lane = 0; + unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned NewDmask = 0; + + // Try to figure out the used register components + for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); + I != E; ++I) { + + // Abort if we can't understand the usage + if (!I->isMachineOpcode() || + I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + return; + + // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. + // Note that subregs are packed, i.e. Lane==0 is the first bit set + // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit + // set, etc. + Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + + // Set which texture component corresponds to the lane. + unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { + assert(Dmask); + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } + + // Abort if we have more than one user per component + if (Users[Lane]) + return; + + Users[Lane] = *I; + NewDmask |= 1 << Comp; + } + + // Abort if there's no change + if (NewDmask == OldDmask) + return; + + // Adjust the writemask in the node + std::vector<SDValue> Ops; + Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); + Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); + + // If we only got one lane, replace it with a copy + // (if NewDmask has only one bit set...)
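+ // NewDmask & (NewDmask - 1) clears the lowest set bit, so the check below + // is the usual test for a nonzero power of two, i.e. exactly one component.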
+ if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), + MVT::i32); + SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + SDLoc(), Users[Lane]->getValueType(0), + SDValue(Node, 0), RC); + DAG.ReplaceAllUsesWith(Users[Lane], Copy); + return; + } + + // Update the users of the node with the new indices + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + + SDNode *User = Users[i]; + if (!User) + continue; + + SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); + DAG.UpdateNodeOperands(User, User->getOperand(0), Op); + + switch (Idx) { + default: break; + case AMDGPU::sub0: Idx = AMDGPU::sub1; break; + case AMDGPU::sub1: Idx = AMDGPU::sub2; break; + case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + } + } +} + +static bool isFrameIndexOp(SDValue Op) { + if (Op.getOpcode() == ISD::AssertZext) + Op = Op.getOperand(0); + + return isa<FrameIndexSDNode>(Op); +} + +/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// with frame index operands. +/// LLVM assumes that inputs to these instructions are registers. +void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < Node->getNumOperands(); ++i) { + if (!isFrameIndexOp(Node->getOperand(i))) { + Ops.push_back(Node->getOperand(i)); + continue; + } + + SDLoc DL(Node); + Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, + Node->getOperand(i).getValueType(), + Node->getOperand(i)), 0)); + } + + DAG.UpdateNodeOperands(Node, Ops); +} + +/// \brief Fold the instructions after selecting them. +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + if (TII->isMIMG(Node->getMachineOpcode())) + adjustWritemask(Node, DAG); + + if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || + Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + legalizeTargetIndependentNode(Node, DAG); + return Node; + } + return Node; +} + +/// \brief Assign the register class depending on the number of +/// bits set in the writemask +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VGPR_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); + MRI.setRegClass(VReg, RC); + return; + } + + // Replace unused atomics with the no return version.
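+ // An atomic whose returned value has no uses can use the non-returning + // encoding; the now-dead result operand is removed below.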
+ int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + if (NoRetAtomicOp != -1) { + if (!Node->hasAnyUseOfValue(0)) { + MI->setDesc(TII->get(NoRetAtomicOp)); + MI->RemoveOperand(0); + } + + return; + } +} + +static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} + +MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); + + // Combine the constants and the pointer. + const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to +/// the resource pointer. 
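+/// RsrcDword1 is OR'd into the high half of the pointer, and RsrcDword2And3 +/// supplies dwords 2 and 3 of the descriptor.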
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const { + SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, DL, MVT::i32)), + 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); +} + +SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), + cast<RegisterSDNode>(VReg)->getReg(), VT); +} + +//===----------------------------------------------------------------------===// +// SI Inline Assembly Support +//===----------------------------------------------------------------------===// + +std::pair<unsigned, const TargetRegisterClass *> +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } + } + } + + if (Constraint.size() > 1) { + const TargetRegisterClass *RC = nullptr; + if (Constraint[1] == 'v') { + RC = &AMDGPU::VGPR_32RegClass; + } else if (Constraint[1] == 's') { + RC = &AMDGPU::SGPR_32RegClass; + } + + if (RC) { + uint32_t Idx; + bool Failed = Constraint.substr(2).getAsInteger(10, Idx); + if (!Failed && Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h 
b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h new file mode 100644 index 0000000..f01b2c0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -0,0 +1,138 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H + +#include "AMDGPUISelLowering.h" +#include "SIInstrInfo.h" + +namespace llvm { + +class SITargetLowering : public AMDGPUTargetLowering { + SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, + SDValue Chain, unsigned Offset, bool Signed) const; + SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, + SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; + + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + + SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, + unsigned AS, + DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; +public: + SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); + + bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, + EVT /*VT*/) const override; + + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS) const override; + + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; + + bool isMemOpUniform(const SDNode *N) const; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + + 
TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + bool enableAggressiveFMAFusion(EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; + + int32_t analyzeImmediate(const SDNode *N) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const override; + void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + + MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; + MachineSDNode *buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const; + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; + SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp new file mode 100644 index 0000000..94e6147 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -0,0 +1,508 @@ +//===-- SIInsertWaits.cpp - Insert S_WAITCNT instructions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously.
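+///
+/// For example (illustrative only):
+///   buffer_load_dword v0, ...   ; issued asynchronously, increments VM_CNT
+///   s_waitcnt vmcnt(0)          ; inserted by this pass
+///   v_add_f32 v1, v0, v0        ; v0 now holds the loaded value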
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +/// \brief One variable for each of the hardware counters +typedef union { + struct { + unsigned VM; + unsigned EXP; + unsigned LGKM; + } Named; + unsigned Array[3]; + +} Counters; + +typedef enum { + OTHER, + SMEM, + VMEM +} InstType; + +typedef Counters RegCounters[512]; +typedef std::pair<unsigned, unsigned> RegInterval; + +class SIInsertWaits : public MachineFunctionPass { + +private: + static char ID; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + + /// \brief Constant hardware limits + static const Counters WaitCounts; + + /// \brief Constant zero value + static const Counters ZeroCounts; + + /// \brief Counter values we have already waited on. + Counters WaitedOn; + + /// \brief Counter values for last instruction issued. + Counters LastIssued; + + /// \brief Registers used by async instructions. + RegCounters UsedRegs; + + /// \brief Registers defined by async instructions. + RegCounters DefinedRegs; + + /// \brief Different export instruction types seen since last wait. + unsigned ExpInstrTypesSeen; + + /// \brief Type of the last opcode. + InstType LastOpcodeType; + + bool LastInstWritesM0; + + /// \brief Whether the machine function returns void + bool ReturnsVoid; + + /// \brief Get increment/decrement amount for this instruction. + Counters getHwCounts(MachineInstr &MI); + + /// \brief Is operand relevant for async execution? + bool isOpRelevant(MachineOperand &Op); + + /// \brief Get register interval an operand affects. + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; + + /// \brief Handle instructions async components + void pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + + /// \brief Insert the actual wait instruction + bool insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Counts); + + /// \brief Do we need def2def checks? + bool unorderedDefines(MachineInstr &MI); + + /// \brief Resolve all operand dependencies to counter requirements + Counters handleOperands(MachineInstr &MI); + + /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 
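+  ///
+  /// This is a workaround for VI and newer; roughly:
+  ///   s_mov_b32 m0, s0
+  ///   s_nop 0          ; required before the message is sent
+  ///   s_sendmsg ...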
+ void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + +public: + SIInsertWaits(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(nullptr), + TRI(nullptr), + ExpInstrTypesSeen(0) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI insert wait instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +char SIInsertWaits::ID = 0; + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; + +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { + return new SIInsertWaits(tm); +} + +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { + uint64_t TSFlags = MI.getDesc().TSFlags; + Counters Result = { { 0, 0, 0 } }; + + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); + + // Only consider stores or EXP for EXP_CNT + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && + (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + + // LGKM may uses larger values + if (TSFlags & SIInstrFlags::LGKM_CNT) { + + if (TII->isSMRD(MI)) { + + if (MI.getNumOperands() != 0) { + assert(MI.getOperand(0).isReg() && + "First LGKM operand must be a register!"); + + // XXX - What if this is a write into a super register? + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + } else { + // s_dcache_inv etc. do not have a a destination register. Assume we + // want a wait on these. + // XXX - What is the right value? + Result.Named.LGKM = 1; + } + } else { + // DS + Result.Named.LGKM = 1; + } + + } else { + Result.Named.LGKM = 0; + } + + return Result; +} + +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { + // Constants are always irrelevant + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + return false; + + // Defines are always relevant + if (Op.isDef()) + return true; + + // For exports all registers are relevant + MachineInstr &MI = *Op.getParent(); + if (MI.getOpcode() == AMDGPU::EXP) + return true; + + // For stores the stored value is also relevant + if (!MI.getDesc().mayStore()) + return false; + + // Check if this operand is the value being stored. + // Special case for DS instructions, since the address + // operand comes before the value operand and it may have + // multiple data operands. + + if (TII->isDS(MI)) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); + if (Data && Op.isIdenticalTo(*Data)) + return true; + + MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + if (Data0 && Op.isIdenticalTo(*Data0)) + return true; + + MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data1 && Op.isIdenticalTo(*Data1)) + return true; + + return false; + } + + // NOTE: This assumes that the value operand is before the + // address operand, and that there is only one value operand. 
+ for (MachineInstr::mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); I != E; ++I) { + + if (I->isReg() && I->isUse()) + return Op.isIdenticalTo(*I); + } + + return false; +} + +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); + assert(Size >= 4); + + RegInterval Result; + Result.first = TRI->getEncodingValue(Reg.getReg()); + Result.second = Result.first + Size / 4; + + return Result; +} + +void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + // Get the hardware counter increments and sum them up + Counters Increment = getHwCounts(*I); + Counters Limit = ZeroCounts; + unsigned Sum = 0; + + for (unsigned i = 0; i < 3; ++i) { + LastIssued.Array[i] += Increment.Array[i]; + if (Increment.Array[i]) + Limit.Array[i] = LastIssued.Array[i]; + Sum += Increment.Array[i]; + } + + // If we don't increase anything then that's it + if (Sum == 0) { + LastOpcodeType = OTHER; + return; + } + + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || + (LastOpcodeType == VMEM && Increment.Named.VM)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + LastInstWritesM0 = false; + } + + if (TII->isSMRD(*I)) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) + LastOpcodeType = VMEM; + } + + // Remember which export instructions we have seen + if (Increment.Named.EXP) { + ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; + } + + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + MachineOperand &Op = I->getOperand(i); + if (!isOpRelevant(Op)) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + // Remember which registers we define + if (Op.isDef()) + DefinedRegs[j] = Limit; + + // and which one we are using + if (Op.isUse()) + UsedRegs[j] = Limit; + } + } +} + +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Required) { + + // End of program? No need to wait on anything + // A function not returning void needs to wait, because other bytecode will + // be appended after it and we don't know what it will be. + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) + return false; + + // Figure out if the async instructions execute in order + bool Ordered[3]; + + // VM_CNT is always ordered + Ordered[0] = true; + + // EXP_CNT is unordered if we have both EXP & VM-writes + Ordered[1] = ExpInstrTypesSeen == 3; + + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS + Ordered[2] = false; + + // The values we are going to put into the S_WAITCNT instruction + Counters Counts = WaitCounts; + + // Do we really need to wait? 
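+  // For an ordered counter the immediate we emit below is the number of
+  // operations that are still allowed to remain outstanding. Illustrative
+  // example: with LastIssued.VM == 5 and Required.VM == 3, waiting for
+  // vmcnt(2) is enough; the two VMEM operations issued after the one we
+  // depend on may still be in flight.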
+ bool NeedWait = false; + + for (unsigned i = 0; i < 3; ++i) { + + if (Required.Array[i] <= WaitedOn.Array[i]) + continue; + + NeedWait = true; + + if (Ordered[i]) { + unsigned Value = LastIssued.Array[i] - Required.Array[i]; + + // Adjust the value to the real hardware possibilities. + Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + + } else + Counts.Array[i] = 0; + + // Remember on what we have waited on. + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + } + + if (!NeedWait) + return false; + + // Reset EXP_CNT instruction types + if (Counts.Named.EXP == 0) + ExpInstrTypesSeen = 0; + + // Build the wait instruction + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm((Counts.Named.VM & 0xF) | + ((Counts.Named.EXP & 0x7) << 4) | + ((Counts.Named.LGKM & 0x7) << 8)); + + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + return true; +} + +/// \brief helper function for handleOperands +static void increaseCounters(Counters &Dst, const Counters &Src) { + + for (unsigned i = 0; i < 3; ++i) + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); +} + +Counters SIInsertWaits::handleOperands(MachineInstr &MI) { + + Counters Result = ZeroCounts; + + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (MI.getOpcode() == AMDGPU::S_SENDMSG) + return LastIssued; + + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI.getOperand(i); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + if (Op.isDef()) { + increaseCounters(Result, UsedRegs[j]); + increaseCounters(Result, DefinedRegs[j]); + } + + if (Op.isUse()) + increaseCounters(Result, DefinedRegs[j]); + } + } + + return Result; +} + +void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < + AMDGPUSubtarget::VOLCANIC_ISLANDS) + return; + + // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. + if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + LastInstWritesM0 = false; + return; + } + + // Set whether this instruction sets M0 + LastInstWritesM0 = false; + + unsigned NumOperands = I->getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + const MachineOperand &Op = I->getOperand(i); + + if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) + LastInstWritesM0 = true; + } +} + +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. 
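+// A minimal sketch of the immediate layout built in insertWait above, assuming
+// the limits encoded in WaitCounts (vmcnt in bits [3:0], expcnt in [6:4],
+// lgkmcnt in [10:8]):
+//
+//   static unsigned encodeWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
+//     return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
+//   }
+//
+// e.g. encodeWaitcnt(0, 7, 7) waits for all outstanding VMEM operations while
+// leaving the EXP and LGKM counters at their maxima, i.e. unconstrained.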
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { + bool Changes = false; + + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + + MRI = &MF.getRegInfo(); + + WaitedOn = ZeroCounts; + LastIssued = ZeroCounts; + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); + + memset(&UsedRegs, 0, sizeof(UsedRegs)); + memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + + // Wait for everything before a barrier. + if (I->getOpcode() == AMDGPU::S_BARRIER) + Changes |= insertWait(MBB, I, LastIssued); + else + Changes |= insertWait(MBB, I, handleOperands(*I)); + + pushInstruction(MBB, I); + handleSendMsg(MBB, I); + } + + // Wait for everything at the end of the MBB + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + + // Functions returning something shouldn't contain S_ENDPGM, because other + // bytecode will be appended after it. + if (!ReturnsVoid) { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + I->eraseFromParent(); + } + } + + return Changes; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td new file mode 100644 index 0000000..0e883f6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -0,0 +1,691 @@ +//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : + AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { + + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; + + field bits<1> SALU = 0; + field bits<1> VALU = 0; + + field bits<1> SOP1 = 0; + field bits<1> SOP2 = 0; + field bits<1> SOPC = 0; + field bits<1> SOPK = 0; + field bits<1> SOPP = 0; + + field bits<1> VOP1 = 0; + field bits<1> VOP2 = 0; + field bits<1> VOP3 = 0; + field bits<1> VOPC = 0; + + field bits<1> MUBUF = 0; + field bits<1> MTBUF = 0; + field bits<1> SMRD = 0; + field bits<1> DS = 0; + field bits<1> MIMG = 0; + field bits<1> FLAT = 0; + field bits<1> WQM = 0; + field bits<1> VGPRSpill = 0; + + // This bit tells the assembler to use the 32-bit encoding in case it + // is unable to infer the encoding from the operands. + field bits<1> VOPAsmPrefer32Bit = 0; + + // These need to be kept in sync with the enum in SIInstrFlags. 
+ let TSFlags{0} = VM_CNT; + let TSFlags{1} = EXP_CNT; + let TSFlags{2} = LGKM_CNT; + + let TSFlags{3} = SALU; + let TSFlags{4} = VALU; + + let TSFlags{5} = SOP1; + let TSFlags{6} = SOP2; + let TSFlags{7} = SOPC; + let TSFlags{8} = SOPK; + let TSFlags{9} = SOPP; + + let TSFlags{10} = VOP1; + let TSFlags{11} = VOP2; + let TSFlags{12} = VOP3; + let TSFlags{13} = VOPC; + + let TSFlags{14} = MUBUF; + let TSFlags{15} = MTBUF; + let TSFlags{16} = SMRD; + let TSFlags{17} = DS; + let TSFlags{18} = MIMG; + let TSFlags{19} = FLAT; + let TSFlags{20} = WQM; + let TSFlags{21} = VGPRSpill; + let TSFlags{22} = VOPAsmPrefer32Bit; + + let SchedRW = [Write32Bit]; +} + +class Enc32 { + field bits<32> Inst; + int Size = 4; +} + +class Enc64 { + field bits<64> Inst; + int Size = 8; +} + +class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; + +let Uses = [EXEC] in { + +class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VALU = 1; +} + +class VOPCCommon <dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <(outs), ins, asm, pattern> { + + let VOPC = 1; + let Size = 4; + let Defs = [VCC]; +} + +class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + let VOP1 = 1; + let Size = 4; +} + +class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + let VOP2 = 1; + let Size = 4; +} + +class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always prefered, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). + let AddedComplexity = -1000; + + let VOP3 = 1; + let VALU = 1; + + let AsmMatchConverter = "cvtVOP3"; + let isCodeGenOnly = 0; + + int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. 
+ let hasPostISelHook = 1; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Scalar operations +//===----------------------------------------------------------------------===// + +class SOP1e <bits<8> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = op; + let Inst{22-16} = sdst; + let Inst{31-23} = 0x17d; //encoding; +} + +class SOP2e <bits<7> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = sdst; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding +} + +class SOPCe <bits<7> op> : Enc32 { + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; +} + +class SOPKe <bits<5> op> : Enc32 { + bits <7> sdst; + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding +} + +class SOPK64e <bits<5> op> : Enc64 { + bits <7> sdst = 0; + bits <16> simm16; + bits <32> imm; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; + + let Inst{63-32} = imm; +} + +class SOPPe <bits<7> op> : Enc32 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding +} + +class SMRDe <bits<5> op, bits<1> imm> : Enc32 { + bits<7> sdst; + bits<7> sbase; + bits<8> offset; + + let Inst{7-0} = offset; + let Inst{8} = imm; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding +} + +class SMRD_IMMe_ci <bits<5> op> : Enc64 { + bits<7> sdst; + bits<7> sbase; + bits<32> offset; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset; +} + +let SchedRW = [WriteSALU] in { +class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP1 = 1; +} + +class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP2 = 1; + + let UseNamedOperandTable = 1; +} + +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, SOPCe <op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPC = 1; + let isCodeGenOnly = 0; + let Defs = [SCC]; + + let UseNamedOperandTable = 1; +} + +class SOPK <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins , asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPK = 1; + + let UseNamedOperandTable = 1; +} + +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPPe <op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + + let UseNamedOperandTable = 1; +} + +} // let SchedRW = [WriteSALU] + +class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let LGKM_CNT = 1; + let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects 
= 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; +} + +//===----------------------------------------------------------------------===// +// Vector ALU operations +//===----------------------------------------------------------------------===// + +class VOP1e <bits<8> op> : Enc32 { + bits<8> vdst; + bits<9> src0; + + let Inst{8-0} = src0; + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +class VOP2e <bits<6> op> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + + let Inst{8-0} = src0; + let Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP2_MADKe <bits<6> op> : Enc64 { + + bits<8> vdst; + bits<9> src0; + bits<8> vsrc1; + bits<32> src2; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = src2; +} + +class VOP3e <bits<9> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{11} = clamp; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be <bits<9> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOPCe <bits<8> op> : Enc32 { + bits<9> src0; + bits<8> vsrc1; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VINTRPe <bits<2> op> : Enc32 { + bits<8> vdst; + bits<8> vsrc; + bits<2> attrchan; + bits<6> attr; + + let Inst{7-0} = vsrc; + let Inst{9-8} = attrchan; + let Inst{15-10} = attr; + let Inst{17-16} = op; + let Inst{25-18} = vdst; + let Inst{31-26} = 0x32; // encoding +} + +class DSe <bits<8> op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{17} = gds; + let Inst{25-18} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe <bits<7> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{16} = lds; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let 
Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe <bits<3> op> : Enc64 { + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{18-16} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MIMGe <bits<7> op> : Enc64 { + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<1> glc; + bits<1> da; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<1> slc; + bits<8> vaddr; + bits<7> srsrc; + bits<7> ssamp; + + let Inst{11-8} = dmask; + let Inst{12} = unorm; + let Inst{13} = glc; + let Inst{14} = da; + let Inst{15} = r128; + let Inst{16} = tfe; + let Inst{17} = lwe; + let Inst{24-18} = op; + let Inst{25} = slc; + let Inst{31-26} = 0x3c; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{57-53} = ssamp{6-2}; +} + +class FLATe<bits<7> op> : Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. + let Inst{55} = tfe; + let Inst{63-56} = vdst; +} + +class EXPe : Enc64 { + bits<4> en; + bits<6> tgt; + bits<1> compr; + bits<1> done; + bits<1> vm; + bits<8> vsrc0; + bits<8> vsrc1; + bits<8> vsrc2; + bits<8> vsrc3; + + let Inst{3-0} = en; + let Inst{9-4} = tgt; + let Inst{10} = compr; + let Inst{11} = done; + let Inst{12} = vm; + let Inst{31-26} = 0x3e; + let Inst{39-32} = vsrc0; + let Inst{47-40} = vsrc1; + let Inst{55-48} = vsrc2; + let Inst{63-56} = vsrc3; +} + +let Uses = [EXEC] in { + +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP1Common <outs, ins, asm, pattern>, + VOP1e<op> { + let isCodeGenOnly = 0; +} + +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP2Common <outs, ins, asm, pattern>, VOP2e<op> { + let isCodeGenOnly = 0; +} + +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : + VOPCCommon <ins, asm, pattern>, VOPCe <op>; + +class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// + +class DS <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let LGKM_CNT = 1; + let DS = 1; + let UseNamedOperandTable = 1; + let Uses = [M0, EXEC]; + + // Most instruction load and store data, so set this as the default. 
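+  // (e.g. ds_read_b32 only loads and ds_write_b32 only stores; such
+  // instructions can override these defaults in their own definitions.)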
+ let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; + let AsmMatchConverter = "cvtDS"; + let SchedRW = [WriteLDS]; +} + +class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + let Uses = [EXEC]; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let AsmMatchConverter = "cvtMubuf"; + let SchedRW = [WriteVMEM]; +} + +class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + let Uses = [EXEC]; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; +} + +class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, FLATe <op> { + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; + let AsmMatchConverter = "cvtFlat"; + let SchedRW = [WriteVMEM]; +} + +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern>, MIMGe <op> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + let Uses = [EXEC]; + + let hasSideEffects = 0; // XXX ???? +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp new file mode 100644 index 0000000..1e10d25 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -0,0 +1,3093 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + + +#include "SIInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +//===----------------------------------------------------------------------===// +// TargetInstrInfo callbacks +//===----------------------------------------------------------------------===// + +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +/// \brief Returns true if both nodes have the same value for the given +/// operand \p Op, or if both nodes do not have this operand. 
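+/// For example, areLoadsFromSameBasePtr below uses this to check that two
+/// MUBUF/MTBUF loads agree on their vaddr, srsrc and soffset operands.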
+static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { + unsigned Opc0 = N0->getMachineOpcode(); + unsigned Opc1 = N1->getMachineOpcode(); + + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); + + if (Op0Idx == -1 && Op1Idx == -1) + return true; + + + if ((Op0Idx == -1 && Op1Idx != -1) || + (Op1Idx == -1 && Op0Idx != -1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index. + --Op0Idx; + --Op1Idx; + + return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); +} + +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + // TODO: The generic check fails for VALU instructions that should be + // rematerializable due to implicit reads of exec. We really want all of the + // generic logic for this except for this. + switch (MI->getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + return true; + default: + return false; + } +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + + // FIXME: Handle this case: + if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) + return false; + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // Skip read2 / write2 variants for simplicity. + // TODO: We should report true if the used offsets are adjacent (excluded + // st64 versions). + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + const ConstantSDNode *Load0Offset = + dyn_cast<ConstantSDNode>(Load0->getOperand(1)); + const ConstantSDNode *Load1Offset = + dyn_cast<ConstantSDNode>(Load1->getOperand(1)); + + if (!Load0Offset || !Load1Offset) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = Load0Offset->getZExtValue(); + Offset1 = Load1Offset->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + + // MUBUF and MTBUF have vaddr at different indices. 
+ if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || + findChainOperand(Load0) != findChainOperand(Load1) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + + if (OffIdx0 == -1 || OffIdx1 == -1) + return false; + + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // inlcude the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + --OffIdx0; + --OffIdx1; + + SDValue Off0 = Load0->getOperand(OffIdx0); + SDValue Off1 = Load1->getOperand(OffIdx1); + + // The offset might be a FrameIndexSDNode. + if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) + return false; + + Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); + return true; + } + + return false; +} + +static bool isStride64(unsigned Opc) { + switch (Opc) { + case AMDGPU::DS_READ2ST64_B32: + case AMDGPU::DS_READ2ST64_B64: + case AMDGPU::DS_WRITE2ST64_B32: + case AMDGPU::DS_WRITE2ST64_B64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + unsigned Opc = LdSt->getOpcode(); + + if (isDS(*LdSt)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (OffsetImm) { + // Normal, single offset LDS instruction. + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. We + // will use this for some partially aligned loads. + const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset1); + + uint8_t Offset0 = Offset0Imm->getImm(); + uint8_t Offset1 = Offset1Imm->getImm(); + + if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { + // Each of these offsets is in element sized units, so we need to convert + // to bytes of the individual reads. 
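+      // Illustrative example: a ds_read2_b32 with offset0 = 2 and offset1 = 3
+      // has a 4-byte element size, so we report the base register with
+      // Offset = 8 (the two reads cover bytes 8..11 and 12..15 from the
+      // address).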
+ + unsigned EltSize; + if (LdSt->mayLoad()) + EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + else { + assert(LdSt->mayStore()); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + } + + if (isStride64(Opc)) + EltSize *= 64; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = EltSize * Offset0; + return true; + } + + return false; + } + + if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + return false; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::vaddr); + if (!AddrReg) + return false; + + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + if (isSMRD(*LdSt)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (!OffsetImm) + return false; + + const MachineOperand *SBaseReg = getNamedOperand(*LdSt, + AMDGPU::OpName::sbase); + BaseReg = SBaseReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + return false; +} + +bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + // TODO: This needs finer tuning + if (NumLoads > 4) + return false; + + if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) + return true; + + if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) + return true; + + if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && + (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) + return true; + + return false; +} + +void +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + + // If we are trying to copy to or from SCC, there is a bug somewhere else in + // the backend. While it may be theoretically possible to do this, it should + // never be necessary. 
+ assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + }; + + static const int16_t Sub0_7[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + }; + + static const int16_t Sub0_3[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + }; + + static const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, + }; + + static const int16_t Sub0_1[] = { + AMDGPU::sub0, AMDGPU::sub1, + }; + + unsigned Opcode; + ArrayRef<int16_t> SubIndices; + bool Forward; + + if (AMDGPU::SReg_32RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (DestReg == AMDGPU::VCC) { + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_3_64; + + } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_7_64; + + } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_15_64; + + } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || + AMDGPU::SReg_64RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_1; + + } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_2; + + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || + AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_3; + + } else if 
(AMDGPU::VReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || + AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_7; + + } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || + AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_15; + + } else { + llvm_unreachable("Can't copy register!"); + } + + if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) + Forward = true; + else + Forward = false; + + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + unsigned SubIdx; + if (Forward) + SubIdx = SubIndices[Idx]; + else + SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, SubIdx)); + + Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); + + if (Idx == SubIndices.size() - 1) + Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); + + if (Idx == 0) + Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + } +} + +int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + + int NewOpc; + + // Try to map original to commuted opcode + NewOpc = AMDGPU::getCommuteRev(Opcode); + if (NewOpc != -1) + // Check if the commuted (REV) opcode exists on the target. + return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; + + // Try to map commuted to original opcode + NewOpc = AMDGPU::getCommuteOrig(Opcode); + if (NewOpc != -1) + // Check if the original (non-REV) opcode exists on the target. + return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; + + return Opcode; +} + +unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + + if (DstRC->getSize() == 4) { + return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + return AMDGPU::S_MOV_B64; + } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + return AMDGPU::V_MOV_B64_PSEUDO; + } + return AMDGPU::COPY; +} + +static unsigned getSGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_SAVE; + case 8: + return AMDGPU::SI_SPILL_S64_SAVE; + case 16: + return AMDGPU::SI_SPILL_S128_SAVE; + case 32: + return AMDGPU::SI_SPILL_S256_SAVE; + case 64: + return AMDGPU::SI_SPILL_S512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_SAVE; + case 8: + return AMDGPU::SI_SPILL_V64_SAVE; + case 16: + return AMDGPU::SI_SPILL_V128_SAVE; + case 32: + return AMDGPU::SI_SPILL_V256_SAVE; + case 64: + return AMDGPU::SI_SPILL_V512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); + + if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for spilling + // SGPRs. 
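+    // e.g. spilling a 128-bit SGPR tuple emits a single SI_SPILL_S128_SAVE
+    // pseudo here; it is expanded into the real store sequence later.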
+ unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) + .addReg(SrcReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + MFI->setHasSpilledVGPRs(); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_S64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_S128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_S256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_S512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_V64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_V128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_V256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_V512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); + + if (RI.isSGPRClass(RC)) { + // FIXME: Maybe this should not include a memoperand because it will be + // lowered to non-memory instructions. 
+ unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +/// \param @Offset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { + if (!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + // FIXME: Can we scavenge an SReg_64 and access the subregs? 
+ unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) + .addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); + } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); + + return TmpReg; +} + +void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + case AMDGPU::SGPR_USE: + // This is just a placeholder for register allocation. + MI->eraseFromParent(); + break; + + case AMDGPU::V_MOV_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + const MachineOperand &SrcOp = MI->getOperand(1); + // FIXME: Will this work for 64-bit floating point immediates?
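+    // The 64-bit move is expanded into two V_MOV_B32s that write the sub0 and
+    // sub1 halves of the destination register.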
+ assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + } else { + assert(SrcOp.isReg()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit); + } + MI->eraseFromParent(); + break; + } + + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + unsigned Src0 = MI->getOperand(1).getReg(); + unsigned Src1 = MI->getOperand(2).getReg(); + const MachineOperand &SrcCond = MI->getOperand(3); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addOperand(SrcCond); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addOperand(SrcCond); + MI->eraseFromParent(); + break; + } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } + } + return true; +} + +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx0 and OpIdx1. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx0 and OpIdx1. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. 
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { + int CommutedOpcode = commuteOpcode(*MI); + if (CommutedOpcode == -1) + return nullptr; + + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + if (!Src0.isReg()) + return nullptr; + + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + + if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || + OpIdx1 != static_cast<unsigned>(Src1Idx)) && + (OpIdx0 != static_cast<unsigned>(Src1Idx) || + OpIdx1 != static_cast<unsigned>(Src0Idx))) + return nullptr; + + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; + } + + if (!Src1.isReg()) { + // Allow commuting instructions with Imm operands. + if (NewMI || !Src1.isImm() || + (!isVOP2(*MI) && !isVOP3(*MI))) { + return nullptr; + } + // Be sure to copy the source modifiers to the right place. + if (MachineOperand *Src0Mods + = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods + = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + + int Src0ModsVal = Src0Mods->getImm(); + if (!Src1Mods && Src0ModsVal != 0) + return nullptr; + + // XXX - This assert might be a lie. It might be useful to have a neg + // modifier with 0.0. + int Src1ModsVal = Src1Mods->getImm(); + assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + } + + unsigned Reg = Src0.getReg(); + unsigned SubReg = Src0.getSubReg(); + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else + llvm_unreachable("Should only have immediates"); + + Src1.ChangeToRegister(Reg, false); + Src1.setSubReg(SubReg); + } else { + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); + } + + if (MI) + MI->setDesc(get(CommutedOpcode)); + + return MI; +} + +// This needs to be implemented because the source modifiers may be inserted +// between the true commutable operands, and the base +// TargetInstrInfo::commuteInstruction uses it. +bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isCommutable()) + return false; + + unsigned Opc = MI->getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + + // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on + // immediate. 
Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); + if (!MI->getOperand(Src0Idx).isReg()) + return false; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; + + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } else + return false; + + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); +} + +MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, + unsigned SrcReg) const { + return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), + DstReg) .addReg(SrcReg); +} + +bool SIInstrInfo::isMov(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + } +} + +static void removeModOperands(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src0_modifiers); + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src1_modifiers); + int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2_modifiers); + + MI.RemoveOperand(Src2ModIdx); + MI.RemoveOperand(Src1ModIdx); + MI.RemoveOperand(Src0ModIdx); +} + +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned Opc = UseMI->getOpcode(); + if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { + // Don't fold if we are using source modifiers. The new VOP2 instructions + // don't have them. + if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + return false; + } + + MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + + // Multiplied part is the constant: Use v_madmk_f32 + // We should only expect these to be on src0 due to canonicalizations. + if (Src0->isReg() && Src0->getReg() == Reg) { + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + if (!Src2->isReg() || + (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + return false; + + // We need to do some weird looking operand shuffling since the madmk + // operands are out of the normal expected order with the multiplied + // constant as the last operand. + // + // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 + // src0 -> src2 K + // src1 -> src0 + // src2 -> src1 + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. 
+ + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::clamp)); + + unsigned Src1Reg = Src1->getReg(); + unsigned Src1SubReg = Src1->getSubReg(); + unsigned Src2Reg = Src2->getReg(); + unsigned Src2SubReg = Src2->getSubReg(); + Src0->setReg(Src1Reg); + Src0->setSubReg(Src1SubReg); + Src0->setIsKill(Src1->isKill()); + + Src1->setReg(Src2Reg); + Src1->setSubReg(Src2SubReg); + Src1->setIsKill(Src2->isKill()); + + if (Opc == AMDGPU::V_MAC_F32_e64) { + UseMI->untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + } + + Src2->ChangeToImmediate(Imm); + + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + + // Added part is the constant: Use v_madak_f32 + if (Src2->isReg() && Src2->getReg() == Reg) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + if (!Src0->isImm() && + (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::clamp)); + + if (Opc == AMDGPU::V_MAC_F32_e64) { + UseMI->untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + } + + // ChangingToImmediate adds Src2 back to the instruction. + Src2->ChangeToImmediate(Imm); + + // These come before src2. + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + } + + return false; +} + +static bool offsetsDoNotOverlap(int WidthA, int OffsetA, + int WidthB, int OffsetB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; + return LowOffset + LowWidth <= HighOffset; +} + +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const { + unsigned BaseReg0, Offset0; + unsigned BaseReg1, Offset1; + + if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && + getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && + "read2 / write2 not expected here yet"); + unsigned Width0 = (*MIa->memoperands_begin())->getSize(); + unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + if (BaseReg0 == BaseReg1 && + offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + return true; + } + } + + return false; +} + +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + "MIa must load from or modify a memory location"); + assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + "MIb must load from or modify a memory location"); + + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + return false; + + // XXX - Can we relax this between address spaces? + if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; + + // TODO: Should we check the address space from the MachineMemOperand? That + // would allow us to distinguish objects we know don't alias based on the + // underlying address space, even if it was lowered to a different one, + // e.g. private accesses lowered to use MUBUF instructions on a scratch + // buffer. + if (isDS(*MIa)) { + if (isDS(*MIb)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(*MIb); + } + + if (isMUBUF(*MIa) || isMTBUF(*MIa)) { + if (isMUBUF(*MIb) || isMTBUF(*MIb)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(*MIb) && !isSMRD(*MIb); + } + + if (isSMRD(*MIa)) { + if (isSMRD(*MIb)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); + } + + if (isFLAT(*MIa)) { + if (isFLAT(*MIb)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return false; + } + + return false; +} + +MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, + MachineBasicBlock::iterator &MI, + LiveVariables *LV) const { + + switch (MI->getOpcode()) { + default: return nullptr; + case AMDGPU::V_MAC_F32_e64: break; + case AMDGPU::V_MAC_F32_e32: { + const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); + if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + return nullptr; + break; + } + } + + const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst); + const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); + const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); + + return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) + .addOperand(*Dst) + .addImm(0) // Src0 mods + .addOperand(*Src0) + .addImm(0) // Src1 mods + .addOperand(*Src1) + .addImm(0) // Src mods + .addOperand(*Src2) + .addImm(0) // clamp + .addImm(0); // omod +} + +bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { + int64_t SVal = Imm.getSExtValue(); + if (SVal >= -16 && SVal <= 64) + return true; + + if (Imm.getBitWidth() == 64) { + uint64_t Val = Imm.getZExtValue(); + return (DoubleToBits(0.0) == Val) || + (DoubleToBits(1.0) == Val) || + (DoubleToBits(-1.0) == Val) || + (DoubleToBits(0.5) == Val) || + 
(DoubleToBits(-0.5) == Val) || + (DoubleToBits(2.0) == Val) || + (DoubleToBits(-2.0) == Val) || + (DoubleToBits(4.0) == Val) || + (DoubleToBits(-4.0) == Val); + } + + // The actual type of the operand does not seem to matter as long + // as the bits match one of the inline immediate values. For example: + // + // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, + // so it is a legal inline immediate. + // + // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in + // floating-point, so it is a legal inline immediate. + uint32_t Val = Imm.getZExtValue(); + + return (FloatToBits(0.0f) == Val) || + (FloatToBits(1.0f) == Val) || + (FloatToBits(-1.0f) == Val) || + (FloatToBits(0.5f) == Val) || + (FloatToBits(-0.5f) == Val) || + (FloatToBits(2.0f) == Val) || + (FloatToBits(-2.0f) == Val) || + (FloatToBits(4.0f) == Val) || + (FloatToBits(-4.0f) == Val); +} + +bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, + unsigned OpSize) const { + if (MO.isImm()) { + // MachineOperand provides no way to tell the true operand size, since it + // only records a 64-bit value. We need to know the size to determine if a + // 32-bit floating point immediate bit pattern is legal for an integer + // immediate. It would be for any 32-bit integer operand, but would not be + // for a 64-bit one. + + unsigned BitSize = 8 * OpSize; + return isInlineConstant(APInt(BitSize, MO.getImm(), true)); + } + + return false; +} + +bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, + unsigned OpSize) const { + return MO.isImm() && !isInlineConstant(MO, OpSize); +} + +static bool compareMachineOp(const MachineOperand &Op0, + const MachineOperand &Op1) { + if (Op0.getType() != Op1.getType()) + return false; + + switch (Op0.getType()) { + case MachineOperand::MO_Register: + return Op0.getReg() == Op1.getReg(); + case MachineOperand::MO_Immediate: + return Op0.getImm() == Op1.getImm(); + default: + llvm_unreachable("Didn't expect to be comparing these operand types"); + } +} + +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); + if (isLiteralConstant(MO, OpSize)) + return RI.opCanUseLiteralConstant(OpInfo.OperandType); + + return RI.opCanUseInlineConstant(OpInfo.OperandType); +} + +bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + int Op32 = AMDGPU::getVOPe32(Opcode); + if (Op32 == -1) + return false; + + return pseudoToMCOpcode(Op32) != -1; +} + +bool SIInstrInfo::hasModifiers(unsigned Opcode) const { + // The src0_modifier operand is present on all instructions + // that have modifiers. + + return AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::src0_modifiers) != -1; +} + +bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const { + const MachineOperand *Mods = getNamedOperand(MI, OpName); + return Mods && Mods->getImm(); +} + +bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const { + // Literal constants use the constant bus. 
+ if (isLiteralConstant(MO, OpSize)) + return true; + + if (!MO.isReg() || !MO.isUse()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); + + // FLAT_SCR is just an SGPR pair. + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) + return true; + + // EXEC register uses the constant bus. + if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) + return true; + + // SGPRs use the constant bus + if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { + return true; + } + + return false; +} + +static unsigned findImplicitSGPRRead(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + switch (MO.getReg()) { + case AMDGPU::VCC: + case AMDGPU::M0: + case AMDGPU::FLAT_SCR: + return MO.getReg(); + + default: + break; + } + } + + return AMDGPU::NoRegister; +} + +bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const { + uint16_t Opcode = MI->getOpcode(); + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + // Make sure the number of operands is correct. + const MCInstrDesc &Desc = get(Opcode); + if (!Desc.isVariadic() && + Desc.getNumOperands() != MI->getNumExplicitOperands()) { + ErrInfo = "Instruction has wrong number of operands."; + return false; + } + + // Make sure the register classes are correct. + for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isFPImm()) { + ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " + "all fp values to integers."; + return false; + } + + int RegClass = Desc.OpInfo[i].RegClass; + + switch (Desc.OpInfo[i].OperandType) { + case MCOI::OPERAND_REGISTER: + if (MI->getOperand(i).isImm()) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case AMDGPU::OPERAND_REG_IMM32: + break; + case AMDGPU::OPERAND_REG_INLINE_C: + if (isLiteralConstant(MI->getOperand(i), + RI.getRegClass(RegClass)->getSize())) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case MCOI::OPERAND_IMMEDIATE: + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { + ErrInfo = "Expected immediate, but got non-immediate"; + return false; + } + // Fall-through + default: + continue; + } + + if (!MI->getOperand(i).isReg()) + continue; + + if (RegClass != -1) { + unsigned Reg = MI->getOperand(i).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + const TargetRegisterClass *RC = RI.getRegClass(RegClass); + if (!RC->contains(Reg)) { + ErrInfo = "Operand has incorrect register class."; + return false; + } + } + } + + + // Verify VOP* + if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. 
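+    // The hardware allows at most one SGPR or literal value to be read over
+    // the constant bus per VALU instruction, so anything beyond a single such
+    // operand (including an implicit SGPR read) is reported as an error below.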
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + unsigned ConstantBusCount = 0; + unsigned SGPRUsed = findImplicitSGPRRead(*MI); + if (SGPRUsed != AMDGPU::NoRegister) + ++ConstantBusCount; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + break; + const MachineOperand &MO = MI->getOperand(OpIdx); + if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (MO.isReg()) { + if (MO.getReg() != SGPRUsed) + ++ConstantBusCount; + SGPRUsed = MO.getReg(); + } else { + ++ConstantBusCount; + } + } + } + if (ConstantBusCount > 1) { + ErrInfo = "VOP* instruction uses the constant bus more than once"; + return false; + } + } + + // Verify misc. restrictions on specific instructions. + if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || + Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { + const MachineOperand &Src0 = MI->getOperand(Src0Idx); + const MachineOperand &Src1 = MI->getOperand(Src1Idx); + const MachineOperand &Src2 = MI->getOperand(Src2Idx); + if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { + if (!compareMachineOp(Src0, Src1) && + !compareMachineOp(Src0, Src2)) { + ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; + return false; + } + } + } + + // Make sure we aren't losing exec uses in the td files. This mostly requires + // being careful when using let Uses to try to add other use registers. + if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { + const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); + if (!Exec || !Exec->isImplicit()) { + ErrInfo = "VALU instruction does not implicitly read exec mask"; + return false; + } + } + + return true; +} + +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: return AMDGPU::INSTRUCTION_LIST_END; + case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; + case AMDGPU::COPY: return AMDGPU::COPY; + case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; + case AMDGPU::S_MOV_B32: + return MI.getOperand(1).isReg() ? 
+ AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; + case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; + case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; + case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; + case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; + case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; + case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; + case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; + case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; + case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; + case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; + case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; + case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORD_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; + case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + } +} + +bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { + return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; +} + +const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCInstrDesc &Desc = get(MI.getOpcode()); + if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || + Desc.OpInfo[OpNo].RegClass == -1) { + unsigned Reg = MI.getOperand(OpNo).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + return RI.getPhysRegClass(Reg); + } + + unsigned RCID = Desc.OpInfo[OpNo].RegClass; + return RI.getRegClass(RCID); +} + +bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: + 
return RI.hasVGPRs(getOpRegClass(MI, 0)); + default: + return RI.hasVGPRs(getOpRegClass(MI, OpNo)); + } +} + +void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &MO = MI->getOperand(OpIdx); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; + const TargetRegisterClass *RC = RI.getRegClass(RCID); + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (MO.isReg()) + Opcode = AMDGPU::COPY; + else if (RI.isSGPRClass(RC)) + Opcode = AMDGPU::S_MOV_B32; + + + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); + if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) + VRC = &AMDGPU::VReg_64RegClass; + else + VRC = &AMDGPU::VGPR_32RegClass; + + unsigned Reg = MRI.createVirtualRegister(VRC); + DebugLoc DL = MBB->findDebugLoc(I); + BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) + .addOperand(MO); + MO.ChangeToRegister(Reg, false); +} + +unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) + const { + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned SubReg = MRI.createVirtualRegister(SubRC); + + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + + // Just in case the super register is itself a sub-register, copy it to a new + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to + // eliminate this extra copy. + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) + .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(NewSuperReg, 0, SubIdx); + + return SubReg; +} + +MachineOperand SIInstrInfo::buildExtractSubRegOrImm( + MachineBasicBlock::iterator MII, + MachineRegisterInfo &MRI, + MachineOperand &Op, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const { + if (Op.isImm()) { + // XXX - Is there a better way to do this? + if (SubIdx == AMDGPU::sub0) + return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); + if (SubIdx == AMDGPU::sub1) + return MachineOperand::CreateImm(Op.getImm() >> 32); + + llvm_unreachable("Unhandled register index for immediate"); + } + + unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, + SubIdx, SubRC); + return MachineOperand::CreateReg(SubReg, false); +} + +// Change the order of operands from (0, 1, 2) to (0, 2, 1) +void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { + assert(Inst->getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst->getOperand(1); + Inst->RemoveOperand(1); + Inst->addOperand(Op1); +} + +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? 
+ MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + RC = TRI->getSubRegClass(RC, MO.getSubReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + +bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO) const { + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const TargetRegisterClass *DefinedRC = + OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + if (!MO) + MO = &MI->getOperand(OpIdx); + + if (isVALU(*MI) && + usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + unsigned SGPRUsed = + MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI->getOperand(i); + if (Op.isReg() && Op.getReg() != SGPRUsed && + usesConstantBus(MRI, Op, getOpSize(*MI, i))) { + return false; + } + } + } + + if (MO->isReg()) { + assert(DefinedRC); + return isLegalRegOperand(MRI, OpInfo, *MO); + } + + + // Handle non-register types that are treated like immediates. + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + + if (!DefinedRC) { + // This operand expects an immediate. + return true; + } + + return isImmOperandLegal(MI, OpIdx, *MO); +} + +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + } + + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; + + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. 
This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. + if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. + if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + MI->setDesc(get(CommutedOpc)); + + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); + + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); + + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} + +// Legalize VOP3 operands. Because all operand types are supported for any +// operand, and since literal constants are not allowed and should never be +// seen, we only need to worry about inserting copies if we use multiple SGPR +// operands. +void SIInstrInfo::legalizeOperandsVOP3( + MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + int VOP3Idx[3] = { + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) + }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + // We should never see a VOP3 instruction with an illegal immediate operand. + if (!MO.isReg()) + continue; + + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; + } + + // If we make it this far, then the operand is not legal and we must + // legalize it. + legalizeOpWithMove(MI, Idx); + } +} + +void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + // Legalize VOP2 + if (isVOP2(*MI)) { + legalizeOperandsVOP2(MRI, MI); + return; + } + + // Legalize VOP3 + if (isVOP3(*MI)) { + legalizeOperandsVOP3(MRI, MI); + return; + } + + // Legalize REG_SEQUENCE and PHI + // The register class of the operands must be the same type as the register + // class of the output.
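+  // For PHIs, if any incoming value is in a VGPR class (or the result cannot
+  // stay in SGPRs), every operand is rewritten to an equivalent VGPR class and
+  // a COPY is inserted before the terminator of each predecessor block.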
+ if (MI->getOpcode() == AMDGPU::PHI) { + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; + for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { + if (!MI->getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + continue; + const TargetRegisterClass *OpRC = + MRI.getRegClass(MI->getOperand(i).getReg()); + if (RI.hasVGPRs(OpRC)) { + VRC = OpRC; + } else { + SRC = OpRC; + } + } + + // If any of the operands are VGPR registers, then they all must be VGPRs; + // otherwise we will create illegal VGPR->SGPR copies when legalizing + // them. + if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { + if (!VRC) { + assert(SRC); + VRC = RI.getEquivalentVGPRClass(SRC); + } + RC = VRC; + } else { + RC = SRC; + } + + // Update all the operands so they have the same type. + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + continue; + unsigned DstReg = MRI.createVirtualRegister(RC); + + // MI is a PHI instruction. + MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); + + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + Op.setReg(DstReg); + } + } + + // REG_SEQUENCE doesn't really require operand legalization, but if one has a + // VGPR dest type and SGPR sources, insert copies so all operands are + // VGPRs. This seems to help operand folding / the register coalescer. + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI->getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (RI.hasVGPRs(DstRC)) { + // Update all the operands so they are VGPR register classes. These may + // not be the same register class because REG_SEQUENCE supports mixing + // subregister index types e.g. sub0_sub1 + sub2 + sub3 + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + continue; + + const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); + if (VRC == OpRC) + continue; + + unsigned DstReg = MRI.createVirtualRegister(VRC); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setIsKill(); + } + } + + return; + } + + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + + // Legalize MUBUF* instructions + // FIXME: If we start using the non-addr64 instructions for compute, we + // may need to legalize them here.
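+  // A MUBUF instruction is recognized by its srsrc operand. When the resource
+  // register is not in the required SGPR class, the 64-bit pointer is pulled
+  // out of it, a descriptor with the default data format is rebuilt, and the
+  // pointer is folded into the 64-bit vaddr (converting _OFFSET forms to their
+  // _ADDR64 variants).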
+ int SRsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (SRsrcIdx != -1) { + // We have an MUBUF instruction + MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), + RI.getRegClass(SRsrcRC))) { + // The operands are legal. + // FIXME: We may need to legalize operands besides srsrc. + return; + } + + MachineBasicBlock &MBB = *MI->getParent(); + + // Extract the ptr from the resource descriptor. + unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); + + // Create an empty resource descriptor + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + // Zero64 = 0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), + Zero64) + .addImm(0); + + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); + + // NewSRsrc = {Zero64, SRsrcFormat} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + if (VAddr) { + // This is already an ADDR64 instruction so we need to add the pointer + // extracted from the resource descriptor to the current value of VAddr. + unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 + DebugLoc DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + + // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); + } else { + // This instruction is the _OFFSET variant, so we need to convert it to + // ADDR64. + assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() + < AMDGPUSubtarget::VOLCANIC_ISLANDS && + "FIXME: Need to emit flat atomics here"); + + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + + // Atomics with return have an additional tied operand and are + // missing some of the special bits.
+ MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in); + MachineInstr *Addr64; + + if (!VDataIn) { + // Regular buffer load / store. + MachineInstrBuilder MIB + = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset); + + // Atomics do not have this operand. + if (const MachineOperand *GLC + = getNamedOperand(*MI, AMDGPU::OpName::glc)) { + MIB.addImm(GLC->getImm()); + } + + MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); + + if (const MachineOperand *TFE + = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { + MIB.addImm(TFE->getImm()); + } + + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + Addr64 = MIB; + } else { + // Atomics with return. + Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*VDataIn) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + } + + MI->removeFromParent(); + MI = Addr64; + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + } + + // Update the instruction to use NewVaddr + VAddr->setReg(NewVAddr); + // Update the instruction to use NewSRsrc + SRsrc->setReg(NewSRsrc); + } +} + +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes + // on VI. + + bool IsKill = SBase->isKill(); + if (OffOp) { + bool isVI = + MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS; + unsigned OffScale = isVI ? 1 : 4; + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm() * OffScale; + unsigned HiOffset = LoOffset + HalfSize; + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + // Use addReg instead of addOperand + // to make sure kill flag is cleared. + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addImm(LoOffset / OffScale); + + if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset); // The offset in register is in bytes. 
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addReg(OffsetSGPR); + } else { + Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addImm(HiOffset / OffScale); + } + } else { + // Handle the _SGPR variant + MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); + Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addOperand(*SOff); + unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) + .addReg(SOff->getReg(), 0, SOff->getSubReg()) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addReg(OffsetSGPR); + } + + unsigned SubLo, SubHi; + const TargetRegisterClass *NewDstRC; + switch (HalfSize) { + case 4: + SubLo = AMDGPU::sub0; + SubHi = AMDGPU::sub1; + NewDstRC = &AMDGPU::VReg_64RegClass; + break; + case 8: + SubLo = AMDGPU::sub0_sub1; + SubHi = AMDGPU::sub2_sub3; + NewDstRC = &AMDGPU::VReg_128RegClass; + break; + case 16: + SubLo = AMDGPU::sub0_sub1_sub2_sub3; + SubHi = AMDGPU::sub4_sub5_sub6_sub7; + NewDstRC = &AMDGPU::VReg_256RegClass; + break; + case 32: + SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + NewDstRC = &AMDGPU::VReg_512RegClass; + break; + default: + llvm_unreachable("Unhandled HalfSize"); + } + + unsigned OldDst = MI->getOperand(0).getReg(); + unsigned NewDst = MRI.createVirtualRegister(NewDstRC); + + MRI.replaceRegWith(OldDst, NewDst); + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); +} + +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + MachineBasicBlock *MBB = MI->getParent(); + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; + switch(RI.getRegClass(DstRCID)->getSize()) { + case 4: + case 8: + case 16: { + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offset on SI and a byte offset on VI, + // and MUBUF instructions always take a byte offset.
+ ImmOffset = MI->getOperand(2).getImm(); + if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= + AMDGPUSubtarget::SEA_ISLANDS) + ImmOffset <<= 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(RsrcDataFormat >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + + const MCInstrDesc &NewInstDesc = get(NewOpcode); + const TargetRegisterClass *NewDstRC + = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + unsigned DstReg = MI->getOperand(0).getReg(); + MRI.replaceRegWith(DstReg, NewDstReg); + + MachineInstr *NewInst = + BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) + .addOperand(MI->getOperand(1)) // sbase + .addReg(SRsrc) + .addImm(0) + .addImm(ImmOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI->eraseFromParent(); + + legalizeOperands(NewInst); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + break; + } + case 32: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); + break; + } + + case 64: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); + break; + } + } +} + +void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { + SmallVector<MachineInstr *, 128> Worklist; + Worklist.push_back(&TopInst); + + while (!Worklist.empty()) { + MachineInstr *Inst = Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + + // Handle some special cases + switch (Opcode) { + default: + if (isSMRD(*Inst)) { + moveSMRDToVALU(Inst, MRI, Worklist); + continue; + } + break; + case AMDGPU::S_AND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_OR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_XOR_B64: + 
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_NOT_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BFE_I64: { + splitScalar64BitBFE(Worklist, Inst); + Inst->eraseFromParent(); + continue; + } + + case AMDGPU::S_LSHL_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHL_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B64; + swapOperands(Inst); + } + break; + + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BFE_U64: + case AMDGPU::S_BFM_B64: + llvm_unreachable("Moving this op to VALU not implemented"); + } + + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { + // We cannot move this instruction to the VALU, so we should try to + // legalize its operands instead. + legalizeOperands(Inst); + continue; + } + + // Use the new VALU Opcode. + const MCInstrDesc &NewDesc = get(NewOpcode); + Inst->setDesc(NewDesc); + + // Remove any references to SCC. Vector instructions can't read from it, and + // We're just about to add the implicit use / defs of VCC, and we don't want + // both. + for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { + MachineOperand &Op = Inst->getOperand(i); + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) + Inst->RemoveOperand(i); + } + + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + Inst->addOperand(MachineOperand::CreateImm(0)); + } + + Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + Inst->RemoveOperand(2); // Remove old immediate. 
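A worked example of the scalar BFE immediate layout unpacked above: the bit offset sits in bits [5:0] and the field width in bits [22:16], so an 8-bit sign extend corresponds to offset 0, width 8 (immediate 0x80000). The pack/unpack helpers are illustrative only.

#include <cassert>
#include <cstdint>

// Pack/unpack the S_BFE_* offset/width immediate
// (offset in bits [5:0], width in bits [22:16]).
static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | ((Width & 0x7f) << 16);
}

static void unpackBFE(uint32_t Imm, uint32_t &Offset, uint32_t &Width) {
  Offset = Imm & 0x3f;
  Width = (Imm & 0x7f0000) >> 16;
}

int main() {
  // An 8-bit sign extend starting at bit 0 packs as 0x80000.
  assert(packBFE(0, 8) == 0x80000);
  uint32_t Off, W;
  unpackBFE(0x80000, Off, W);
  assert(Off == 0 && W == 8);
  return 0;
}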
+ Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + } + + // Update the destination register class. + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; + + unsigned DstReg = Inst->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + + // Legalize the operands + legalizeOperands(Inst); + + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + } +} + +//===----------------------------------------------------------------------===// +// Indirect addressing callbacks +//===----------------------------------------------------------------------===// + +unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + .addImm(0) + .addReg(Src.getReg()); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(Src.getReg()) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? 
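In plain arithmetic, the S_ABS_I32 lowering in lowerScalarAbs above computes abs(x) = max(x, 0 - x), with the subtraction wrapping modulo 2^32 the way V_SUB_I32 does. A small model, using unsigned arithmetic so the wrap-around stays well defined in C++:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Model of the V_SUB_I32 + V_MAX_I32 sequence used to lower S_ABS_I32.
static int32_t valuAbs(int32_t X) {
  // 0 - x with 32-bit wrap-around, as the hardware subtract behaves.
  int32_t Neg = static_cast<int32_t>(0u - static_cast<uint32_t>(X));
  return std::max(X, Neg);
}

int main() {
  assert(valuAbs(-7) == 7);
  assert(valuAbs(42) == 42);
  // INT32_MIN has no positive counterpart; in this model it maps to itself.
  assert(valuAbs(INT32_MIN) == INT32_MIN);
  return 0;
}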
+ MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // We don't need to legalizeOperands here because for a single operand, src0 + // will support any kind of input. + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + MachineOperand &Src1 = Inst->getOperand(2); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? + MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1RC = Src1.isReg() ? 
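The surrounding splitScalar64Bit* routines exist because the 64-bit scalar bitwise operations have no single VALU counterpart: each is performed as two independent 32-bit operations on the sub0/sub1 halves, and the results are stitched back together with REG_SEQUENCE. A scalar model of that decomposition, using S_AND_B64 as the example:

#include <cassert>
#include <cstdint>

// Model of splitScalar64BitBinaryOp for S_AND_B64: operate on the two
// 32-bit halves separately, then recombine (the REG_SEQUENCE step).
static uint64_t and64AsTwo32(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) & static_cast<uint32_t>(B);  // sub0
  uint32_t Hi = static_cast<uint32_t>(A >> 32) &
                static_cast<uint32_t>(B >> 32);                       // sub1
  return (static_cast<uint64_t>(Hi) << 32) | Lo;                      // REG_SEQUENCE
}

int main() {
  assert(and64AsTwo32(0xffff0000ffff0000ULL, 0x00ffff0000ffff00ULL) ==
         (0xffff0000ffff0000ULL & 0x00ffff0000ffff00ULL));
  return 0;
}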
+ MRI.getRegClass(Src1.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub0, Src1SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + legalizeOperands(LoHalf); + legalizeOperands(HiHalf); + + // Move all users of this moved vlaue. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); + const TargetRegisterClass *SrcRC = Src.isReg() ? + MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + // We don't need to legalize operands here. src0 for etiher instruction can be + // an SGPR, and the second input is unused or determined here. 
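V_BCNT_U32_B32 adds its second operand to the population count of its first, so the 64-bit count built in splitScalar64BitBCNT above is just two chained 32-bit counts: count the low half onto 0, then count the high half onto that result. A portable scalar model:

#include <cassert>
#include <cstdint>

// Plain popcount so the model needs no compiler builtins.
static uint32_t popcount32(uint32_t V) {
  uint32_t N = 0;
  for (; V; V &= V - 1)
    ++N;
  return N;
}

// V_BCNT_U32_B32 computes popcount(src0) + src1.
static uint32_t vBcnt(uint32_t Src0, uint32_t Acc) {
  return popcount32(Src0) + Acc;
}

// Model of splitScalar64BitBCNT: chain two 32-bit counts.
static uint32_t bcnt64(uint64_t V) {
  uint32_t Mid = vBcnt(static_cast<uint32_t>(V), 0);   // low half onto 0
  return vBcnt(static_cast<uint32_t>(V >> 32), Mid);   // high half onto Mid
}

int main() {
  assert(bcnt64(0xF0F0F0F0F0F0F0F0ULL) == 32);
  return 0;
}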
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + +void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + uint32_t Imm = Inst->getOperand(2).getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + (void) Offset; + + // Only sext_inreg cases handled. + assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && + BitWidth <= 32 && + Offset == 0 && + "Not implemented"); + + if (BitWidth < 32) { + unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) + .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) + .addImm(31) + .addReg(MidRegLo); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(MidRegLo) + .addImm(AMDGPU::sub0) + .addReg(MidRegHi) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return; + } + + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) + .addImm(31) + .addReg(Src.getReg(), 0, AMDGPU::sub0); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(Src.getReg(), 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + +void SIInstrInfo::addUsersToMoveToVALUWorklist( + unsigned DstReg, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); + } + } +} + +const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( + const MachineInstr &Inst) const { + const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); + + switch (Inst.getOpcode()) { + // For target instructions, getOpRegClass just returns the virtual register + // class associated with the operand, so we need to find an equivalent VGPR + // register class in order to move the instruction to the VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + return NewDstRC; + default: + return NewDstRC; + } +} + +// Find the one SGPR operand we are allowed to use. +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, + int OpIndices[3]) const { + const MCInstrDesc &Desc = MI->getDesc(); + + // Find the one SGPR operand we are allowed to use. 
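A scalar model of the S_BFE_I64 (64-bit sext_inreg) lowering in splitScalar64BitBFE above: sign-extend the low field with V_BFE_I32 (or take the low dword directly when the width is 32), then produce the high half as the sign word with an arithmetic shift right by 31.

#include <cassert>
#include <cstdint>

// Model of splitScalar64BitBFE for offset 0, width <= 32.
static int64_t sextInReg64(uint64_t V, unsigned Width) {
  assert(Width >= 1 && Width <= 32);
  int32_t Lo;
  if (Width < 32) {
    // V_BFE_I32 src, 0, Width: extract and sign-extend the low field.
    uint32_t Field = static_cast<uint32_t>(V) & ((1u << Width) - 1);
    uint32_t SignBit = 1u << (Width - 1);
    Lo = static_cast<int32_t>((Field ^ SignBit) - SignBit);
  } else {
    Lo = static_cast<int32_t>(V); // width 32: the low dword is the result
  }
  int32_t Hi = Lo >> 31;          // V_ASHRREV_I32 31, Lo: all sign bits
  return (static_cast<int64_t>(Hi) << 32) |
         static_cast<uint32_t>(Lo);          // REG_SEQUENCE of the two halves
}

int main() {
  assert(sextInReg64(0x00000000000000FFULL, 8) == -1);
  assert(sextInReg64(0x000000000000007FULL, 8) == 127);
  return 0;
}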
+ // + // First we need to consider the instruction's operand requirements before + // legalizing. Some operands are required to be SGPRs, such as implicit uses + // of VCC, but we are still bound by the constant bus requirement to only use + // one. + // + // If the operand's class is an SGPR, we can never move it. + + unsigned SGPRReg = findImplicitSGPRRead(*MI); + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; + + unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = OpIndices[i]; + if (Idx == -1) + break; + + const MachineOperand &MO = MI->getOperand(Idx); + if (!MO.isReg()) + continue; + + // Is this operand statically required to be an SGPR based on the operand + // constraints? + const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); + bool IsRequiredSGPR = RI.isSGPRClass(OpRC); + if (IsRequiredSGPR) + return MO.getReg(); + + // If this could be a VGPR or an SGPR, Check the dynamic register class. + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); + if (RI.isSGPRClass(RegRC)) + UsedSGPRs[i] = Reg; + } + + // We don't have a required SGPR operand, so we have a bit more freedom in + // selecting operands to move. + + // Try to select the most used SGPR. If an SGPR is equal to one of the + // others, we choose that. + // + // e.g. + // V_FMA_F32 v0, s0, s0, s0 -> No moves + // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + + // TODO: If some of the operands are 64-bit SGPRs and some 32, we should + // prefer those. + + if (UsedSGPRs[0] != AMDGPU::NoRegister) { + if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[0]; + } + + if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { + if (UsedSGPRs[1] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[1]; + } + + return SGPRReg; +} + +MachineInstrBuilder SIInstrInfo::buildIndirectWrite( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) + .addReg(IndirectBaseReg, RegState::Define) + .addOperand(I->getOperand(0)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0) + .addReg(ValueReg); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectRead( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0); + +} + +void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + int End = getIndirectIndexEnd(MF); + int Begin = getIndirectIndexBegin(MF); + + if (End == -1) + return; + + + for (int Index = Begin; Index <= End; ++Index) + Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 2); Index <= End; 
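findUsedSGPR above encodes the VALU constant-bus restriction: a VALU instruction may read at most one SGPR, so when several source operands are SGPRs the legalizer keeps the one that is reused and copies the others to VGPRs. A standalone sketch of the same selection step, with NoReg standing in for AMDGPU::NoRegister:

#include <cassert>

// Illustrative reimplementation of the "pick the most reused SGPR" step.
static const unsigned NoReg = 0; // stands in for AMDGPU::NoRegister

static unsigned pickSGPRToKeep(const unsigned UsedSGPRs[3]) {
  // V_FMA_F32 v0, s0, s0, s0 -> keep s0, nothing to move.
  // V_FMA_F32 v0, s0, s1, s0 -> keep s0, move s1 to a VGPR.
  if (UsedSGPRs[0] != NoReg &&
      (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]))
    return UsedSGPRs[0];
  if (UsedSGPRs[1] != NoReg && UsedSGPRs[1] == UsedSGPRs[2])
    return UsedSGPRs[1];
  return NoReg; // no reuse found: report no preferred SGPR to the caller
}

int main() {
  unsigned A[3] = {10, 11, 10};
  assert(pickSGPRToKeep(A) == 10);
  unsigned B[3] = {10, 11, 12};
  assert(pickSGPRToKeep(B) == NoReg);
  return 0;
}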
++Index) + Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); +} + +MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, + unsigned OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); + if (Idx == -1) + return nullptr; + + return &MI.getOperand(Idx); +} + +uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; + if (ST.isAmdHsaOS()) { + RsrcDataFormat |= (1ULL << 56); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + // Set MTYPE = 2 + RsrcDataFormat |= (2ULL << 59); + } + + return RsrcDataFormat; +} + +uint64_t SIInstrInfo::getScratchRsrcWords23() const { + uint64_t Rsrc23 = getDefaultRsrcDataFormat() | + AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size; + + // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. + // Clear them unless we want a huge stride. + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; + + return Rsrc23; +} + +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isSMRD(Opc); +} + +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h new file mode 100644 index 0000000..cce1ae7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -0,0 +1,518 @@ +//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIInstrInfo. 
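The resource-descriptor helpers above combine a handful of 64-bit constants; a sketch of the same arithmetic, using the RSRC_DATA_FORMAT and RSRC_TID_ENABLE values defined later in this patch (0xf00000000000 and 1 << 55). The boolean parameters stand in for the subtarget queries and the function names are illustrative.

#include <cassert>
#include <cstdint>

static const uint64_t RsrcDataFormat = 0xf00000000000ULL; // AMDGPU::RSRC_DATA_FORMAT
static const uint64_t RsrcTidEnable  = 1ULL << 55;        // AMDGPU::RSRC_TID_ENABLE

static uint64_t defaultRsrcDataFormat(bool IsHSA, bool IsVIOrLater) {
  uint64_t R = RsrcDataFormat;
  if (IsHSA) {
    R |= 1ULL << 56;      // extra bit set for the HSA OS, as in the code above
    if (IsVIOrLater)
      R |= 2ULL << 59;    // MTYPE = 2
  }
  return R;
}

static uint64_t scratchRsrcWords23(bool IsHSA, bool IsVIOrLater) {
  uint64_t R = defaultRsrcDataFormat(IsHSA, IsVIOrLater) |
               RsrcTidEnable | 0xffffffffULL;  // Size
  // With TID_ENABLE set, DATA_FORMAT specifies stride bits [14:17]; clear it.
  if (IsVIOrLater)
    R &= ~RsrcDataFormat;
  return R;
}

int main() {
  assert((scratchRsrcWords23(false, true) & RsrcDataFormat) == 0);
  return 0;
}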
+// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "SIDefines.h" +#include "SIRegisterInfo.h" + +namespace llvm { + +class SIInstrInfo : public AMDGPUInstrInfo { +private: + const SIRegisterInfo RI; + + unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + + void swapOperands(MachineBasicBlock::iterator Inst) const; + + void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + + void addUsersToMoveToVALUWorklist( + unsigned Reg, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + const TargetRegisterClass * + getDestEquivalentVGPRClass(const MachineInstr &Inst) const; + + bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const; + + unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + +protected: + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + +public: + explicit SIInstrInfo(const AMDGPUSubtarget &st); + + const SIRegisterInfo &getRegisterInfo() const override { + return RI; + } + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const final; + + bool shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + // \brief Returns an opcode that can be used to move a value to a \p DstRC + // register. 
If there is no hardware instruction that can store to \p + // DstRC, then AMDGPU::COPY is returned. + unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + + LLVM_READONLY + int commuteOpcode(const MachineInstr &MI) const; + + bool findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + bool areMemAccessesTriviallyDisjoint( + MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + bool isMov(unsigned Opcode) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const final; + + unsigned getMachineCSELookAheadLimit() const override { return 500; } + + MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB, + MachineBasicBlock::iterator &MI, + LiveVariables *LV) const override; + + static bool isSALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SALU; + } + + bool isSALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SALU; + } + + static bool isVALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VALU; + } + + bool isVALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VALU; + } + + static bool isSOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP1; + } + + bool isSOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP1; + } + + static bool isSOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP2; + } + + bool isSOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP2; + } + + static bool isSOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPC; + } + + bool isSOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPC; + } + + static bool isSOPK(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK; + } + + bool isSOPK(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK; + } + + static bool isSOPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPP; + } + + bool isSOPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPP; + } + + static bool isVOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP1; + } + + bool isVOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP1; + } + + static bool isVOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP2; + } + + bool isVOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP2; + } + + static bool isVOP3(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + } + + bool isVOP3(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3; + } + + static bool isVOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOPC; + } + + bool isVOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOPC; + } + + static bool isMUBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; + } + + bool isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; + } + + static bool isMTBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; + } + + bool isMTBUF(uint16_t 
Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; + } + + static bool isSMRD(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SMRD; + } + + bool isSMRD(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SMRD; + } + + static bool isDS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DS; + } + + bool isDS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DS; + } + + static bool isMIMG(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MIMG; + } + + bool isMIMG(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MIMG; + } + + static bool isFLAT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FLAT; + } + + bool isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; + } + + static bool isWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::WQM; + } + + bool isWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::WQM; + } + + static bool isVGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; + } + + bool isVGPRSpill(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; + } + + bool isInlineConstant(const APInt &Imm) const; + bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; + + bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const; + + /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// This function will return false if you pass it a 32-bit instruction. + bool hasVALU32BitEncoding(unsigned Opcode) const; + + /// \brief Returns true if this operand uses the constant bus. + bool usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const; + + /// \brief Return true if this instruction has any modifiers. + /// e.g. src[012]_mod, omod, clamp. + bool hasModifiers(unsigned Opcode) const; + + bool hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const; + + bool verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const override; + + static unsigned getVALUOp(const MachineInstr &MI); + + bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; + + /// \brief Return the correct register class for \p OpNo. For target-specific + /// instructions, this will return the register class that has been defined + /// in tablegen. For generic instructions, like REG_SEQUENCE it will return + /// the register class of its machine operand. + /// to infer the correct register class base on the other operands. + const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const; + + /// \brief Return the size in bytes of the operand OpNo on the given + // instruction opcode. + unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; + + if (OpInfo.RegClass == -1) { + // If this is an immediate operand, this must be a 32-bit literal. + assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); + return 4; + } + + return RI.getRegClass(OpInfo.RegClass)->getSize(); + } + + /// \brief This form should usually be preferred since it handles operands + /// with unknown register classes. 
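The static/member predicate pairs above all test one format bit in the instruction descriptor's TSFlags: the static forms take a MachineInstr, the member forms an opcode. A usage sketch in the spirit of isLowLatencyInstruction / isHighLatencyInstruction; it assumes the in-tree headers from this patch are visible and is not a standalone program.

// Usage sketch only; compiles against the AMDGPU backend headers above.
#include "SIInstrInfo.h"
using namespace llvm;

// Classify a memory access the way the latency helpers do, showing both the
// opcode-based member form and the MachineInstr-based static form.
static bool isBufferOrImageAccess(const SIInstrInfo &TII,
                                  const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return TII.isMUBUF(Opc) || TII.isMTBUF(Opc) || SIInstrInfo::isMIMG(MI);
}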
+ unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { + return getOpRegClass(MI, OpNo)->getSize(); + } + + /// \returns true if it is legal for the operand at index \p OpNo + /// to read a VGPR. + bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; + + /// \brief Legalize the \p OpIndex operand of this instruction by inserting + /// a MOV. For example: + /// ADD_I32_e32 VGPR0, 15 + /// to + /// MOV VGPR1, 15 + /// ADD_I32_e32 VGPR0, VGPR1 + /// + /// If the operand being legalized is a register, then a COPY will be used + /// instead of MOV. + void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + + /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// for \p MI. + bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO = nullptr) const; + + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Fix operands in \p MI to satisfy constant bus requirements. + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Legalize all operands in this instruction. This function may + /// create new instruction and insert them before \p MI. + void legalizeOperands(MachineInstr *MI) const; + + /// \brief Split an SMRD instruction into two smaller loads of half the + // size storing the results in \p Lo and \p Hi. + void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const; + + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + /// \brief Replace this instruction's opcode with the equivalent VALU + /// opcode. This function will also move the users of \p MI to the + /// VALU if necessary. + void moveToVALU(MachineInstr &MI) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, + unsigned SavReg, unsigned IndexReg) const; + + void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; + + /// \brief Returns the operand named \p Op. If \p MI does not have an + /// operand named \c Op, this function returns nullptr. 
+ LLVM_READONLY + MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + + LLVM_READONLY + const MachineOperand *getNamedOperand(const MachineInstr &MI, + unsigned OpName) const { + return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); + } + + /// Get required immediate operand + int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + return MI.getOperand(Idx).getImm(); + } + + uint64_t getDefaultRsrcDataFormat() const; + uint64_t getScratchRsrcWords23() const; + + bool isLowLatencyInstruction(const MachineInstr *MI) const; + bool isHighLatencyInstruction(const MachineInstr *MI) const; +}; + +namespace AMDGPU { + LLVM_READONLY + int getVOPe64(uint16_t Opcode); + + LLVM_READONLY + int getVOPe32(uint16_t Opcode); + + LLVM_READONLY + int getCommuteRev(uint16_t Opcode); + + LLVM_READONLY + int getCommuteOrig(uint16_t Opcode); + + LLVM_READONLY + int getAddr64Inst(uint16_t Opcode); + + LLVM_READONLY + int getAtomicRetOp(uint16_t Opcode); + + LLVM_READONLY + int getAtomicNoRetOp(uint16_t Opcode); + + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; + const uint64_t RSRC_TID_ENABLE = 1LL << 55; + +} // End namespace AMDGPU + +namespace SI { +namespace KernelInputOffsets { + +/// Offsets in bytes from the start of the input buffer +enum Offsets { + NGROUPS_X = 0, + NGROUPS_Y = 4, + NGROUPS_Z = 8, + GLOBAL_SIZE_X = 12, + GLOBAL_SIZE_Y = 16, + GLOBAL_SIZE_Z = 20, + LOCAL_SIZE_X = 24, + LOCAL_SIZE_Y = 28, + LOCAL_SIZE_Z = 32 +}; + +} // End namespace KernelInputOffsets +} // End namespace SI + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td new file mode 100644 index 0000000..8735277 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -0,0 +1,2929 @@ +//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
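For the SI::KernelInputOffsets table in the header above, a struct view of the same input buffer layout may be easier to read; every field is a 32-bit value at the listed byte offset. The struct and field names are descriptive only, not part of the backend.

#include <cstddef>
#include <cstdint>

// Struct view of the kernel input buffer implied by SI::KernelInputOffsets.
struct KernelInputs {
  uint32_t NumGroups[3];   // byte offsets 0, 4, 8
  uint32_t GlobalSize[3];  // byte offsets 12, 16, 20
  uint32_t LocalSize[3];   // byte offsets 24, 28, 32
};

static_assert(offsetof(KernelInputs, GlobalSize) == 12,
              "GLOBAL_SIZE_X lives at byte offset 12");
static_assert(offsetof(KernelInputs, LocalSize) == 24,
              "LOCAL_SIZE_X lives at byte offset 24");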
+// +//===----------------------------------------------------------------------===// +def isCI : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isCIOnly : Predicate<"Subtarget->getGeneration() ==" + "AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate <"FeatureSeaIslands">; + +def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; + +class vop { + field bits<9> SI3; + field bits<10> VI3; +} + +class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {0, si{7-0}}; + field bits<10> VI3 = {0, 0, vi{7-0}}; +} + +class vop1 <bits<8> si, bits<8> vi = si> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {1, 1, si{6-0}}; + field bits<10> VI3 = !add(0x140, vi); +} + +class vop2 <bits<6> si, bits<6> vi = si> : vop { + field bits<6> SI = si; + field bits<6> VI = vi; + + field bits<9> SI3 = {1, 0, 0, si{5-0}}; + field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; +} + +// Specify a VOP2 opcode for SI and VOP3 opcode for VI +// that doesn't have VOP2 encoding on VI +class vop23 <bits<6> si, bits<10> vi> : vop2 <si> { + let VI3 = vi; +} + +class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop { + let SI3 = si; + let VI3 = vi; +} + +class sop1 <bits<8> si, bits<8> vi = si> { + field bits<8> SI = si; + field bits<8> VI = vi; +} + +class sop2 <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class sopk <bits<5> si, bits<5> vi = si> { + field bits<5> SI = si; + field bits<5> VI = vi; +} + +// Specify an SMRD opcode for SI and SMEM opcode for VI + +// FIXME: This should really be bits<5> si, Tablegen crashes if +// parameter default value is other parameter with different bit size +class smrd<bits<8> si, bits<8> vi = si> { + field bits<5> SI = si{4-0}; + field bits<8> VI = vi; +} + +// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum +// in AMDGPUInstrInfo.cpp +def SISubtarget { + int NONE = -1; + int SI = 0; + int VI = 1; +} + +//===----------------------------------------------------------------------===// +// SI DAG Nodes +//===----------------------------------------------------------------------===// + +def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, + [SDNPMayLoad, SDNPMemOperand] +>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", + SDTypeProfile<0, 13, + [SDTCisVT<0, v4i32>, // rsrc(SGPR) + SDTCisVT<1, iAny>, // vdata(VGPR) + SDTCisVT<2, i32>, // num_channels(imm) + SDTCisVT<3, i32>, // vaddr(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // offen(imm) + SDTCisVT<9, i32>, // idxen(imm) + SDTCisVT<10, i32>, // glc(imm) + SDTCisVT<11, i32>, // slc(imm) + SDTCisVT<12, i32> // tfe(imm) + ]>, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, + SDTCisVT<3, i32>]> +>; + +class SDSample<string opcode> : SDNode <opcode, + SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>, + SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> +>; + +def SIsample : SDSample<"AMDGPUISD::SAMPLE">; +def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; +def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; +def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; + 
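The vop1/vop2/vopc classes near the top of this file derive each instruction's VOP3 opcode from its base encoding, and the bit patterns they assemble amount to fixed per-family offsets. A small sketch of the SI-side mapping implied by those classes (the opcode values in the asserts are examples, not real instructions):

#include <cassert>
#include <cstdint>

// VOP3 opcode ranges implied by the vop* classes above (SI encoding):
//   VOPC: SI3 = op            (and the default VI base op is 0x40 + SI op)
//   VOP1: SI3 = 0x180 | op    (VI3 = 0x140 + op)
//   VOP2: SI3 = 0x100 | op    (VI3 = 0x100 | op)
enum class VopKind { VOPC, VOP1, VOP2 };

static unsigned vop3OpcodeSI(VopKind K, unsigned Op) {
  switch (K) {
  case VopKind::VOPC: return Op & 0xff;
  case VopKind::VOP1: return 0x180 | (Op & 0x7f);
  case VopKind::VOP2: return 0x100 | (Op & 0x3f);
  }
  return 0;
}

int main() {
  // e.g. a VOP1 with base opcode 0x01 becomes VOP3 opcode 0x181 on SI.
  assert(vop3OpcodeSI(VopKind::VOP1, 0x01) == 0x181);
  assert(vop3OpcodeSI(VopKind::VOP2, 0x3f) == 0x13f);
  return 0;
}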
+def SIconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, + SDTCisVT<0, i64>]> +>; + +//===----------------------------------------------------------------------===// +// PatFrags for FLAT instructions +//===----------------------------------------------------------------------===// + +class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), + (ld node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)) || + isGlobalLoad(dyn_cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def flat_load : flat_ld <load>; +def flat_az_extloadi8 : flat_ld <az_extloadi8>; +def flat_sextloadi8 : flat_ld <sextloadi8>; +def flat_az_extloadi16 : flat_ld <az_extloadi16>; +def flat_sextloadi16 : flat_ld <sextloadi16>; + +class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)) || + isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def flat_store: flat_st <store>; +def flat_truncstorei8 : flat_st <truncstorei8>; +def flat_truncstorei16 : flat_st <truncstorei16>; + + +def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(cast<LoadSDNode>(N), -1) && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); +}]>; + +//===----------------------------------------------------------------------===// +// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 +// to be glued to the memory instructions. +//===----------------------------------------------------------------------===// + +def SIld_local : SDNode <"ISD::LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ + return isLocalLoad(cast<LoadSDNode>(N)); +}]>; + +def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && + cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def si_load_local_align8 : Aligned8Bytes < + (ops node:$ptr), (si_load_local node:$ptr) +>; + +def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def si_az_extload_local : AZExtLoadBase <si_ld_local>; + +multiclass SIExtLoadLocal <PatFrag ld_node> { + + def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}] + >; + + def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}] + >; +} + +defm si_sextload_local : SIExtLoadLocal <si_sextload_local>; +defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>; + +def SIst_local : SDNode <"ISD::STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def si_st_local : PatFrag < + (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ + return isLocalStore(cast<StoreSDNode>(N)); +}]>; + +def si_store_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && + !cast<StoreSDNode>(N)->isTruncatingStore(); +}]>; + +def si_store_local_align8 : Aligned8Bytes < + (ops node:$val, node:$ptr), 
(si_store_local node:$val, node:$ptr) +>; + +def si_truncstore_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->isTruncatingStore(); +}]>; + +def si_truncstore_local_i8 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def si_truncstore_local_i16 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +multiclass SIAtomicM0Glue2 <string op_name> { + + def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] + >; + + def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; +} + +defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; +defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; +defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; +defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; +defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; +defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; +defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; + +def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>; + +// Transformation function, extract the lower 32bit of a 64bit immediate +def LO32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), + MVT::i32); +}]>; + +def LO32f : SDNodeXForm<fpimm, [{ + APInt V = N->getValueAPF().bitcastToAPInt().trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); +}]>; + +// Transformation function, extract the upper 32bit of a 64bit immediate +def HI32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); +}]>; + +def HI32f : SDNodeXForm<fpimm, [{ + APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), + MVT::f32); +}]>; + +def IMM8bitDWORD : PatLeaf <(imm), + [{return (N->getZExtValue() & ~0x3FC) == 0;}] +>; + +def as_dword_i32imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); +}]>; + +def as_i1imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); +}]>; + +def as_i8imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); +}]>; + +def as_i16imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); +}]>; + +def as_i32imm: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + +def as_i64imm: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + 
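The LO32/HI32 transforms above (and their floating-point variants) simply split a 64-bit immediate into two 32-bit target constants; in plain C++ terms:

#include <cassert>
#include <cstdint>

// Plain-arithmetic equivalents of the LO32 / HI32 SDNodeXForms.
static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V & 0xffffffff); }
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }

int main() {
  uint64_t Imm = 0x1122334455667788ULL;
  assert(lo32(Imm) == 0x55667788u);
  assert(hi32(Imm) == 0x11223344u);
  return 0;
}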
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); +}]>; + +def IMM8bit : PatLeaf <(imm), + [{return isUInt<8>(N->getZExtValue());}] +>; + +def IMM12bit : PatLeaf <(imm), + [{return isUInt<12>(N->getZExtValue());}] +>; + +def IMM16bit : PatLeaf <(imm), + [{return isUInt<16>(N->getZExtValue());}] +>; + +def IMM20bit : PatLeaf <(imm), + [{return isUInt<20>(N->getZExtValue());}] +>; + +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + +def mubuf_vaddr_offset : PatFrag< + (ops node:$ptr, node:$offset, node:$imm_offset), + (add (add node:$ptr, node:$offset), node:$imm_offset) +>; + +class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ + return isInlineImmediate(N); +}]>; + +class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ + return isInlineImmediate(N); +}]>; + +class SGPRImm <dag frag> : PatLeaf<frag, [{ + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return false; + } + const SIRegisterInfo *SIRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (RC && SIRI->isSGPRClass(RC)) + return true; + } + return false; +}]>; + +//===----------------------------------------------------------------------===// +// Custom Operands +//===----------------------------------------------------------------------===// + +def FRAMEri32 : Operand<iPTR> { + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); +} + +def SoppBrTarget : AsmOperandClass { + let Name = "SoppBrTarget"; + let ParserMethod = "parseSOppBrTarget"; +} + +def sopp_brtarget : Operand<OtherVT> { + let EncoderMethod = "getSOPPBrEncoding"; + let OperandType = "OPERAND_PCREL"; + let ParserMatchClass = SoppBrTarget; +} + +def const_ga : Operand<iPTR>; + +include "SIInstrFormats.td" +include "VIInstrFormats.td" + +def MubufOffsetMatchClass : AsmOperandClass { + let Name = "MubufOffset"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +class DSOffsetBaseMatchClass <string parser> : AsmOperandClass { + let Name = "DSOffset"#parser; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset"; +} + +def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; +def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; + +def DSOffset01MatchClass : AsmOperandClass { + let Name = "DSOffset1"; + let ParserMethod = "parseDSOff01OptionalOps"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset01"; +} + +class GDSBaseMatchClass <string parser> : AsmOperandClass { + let Name = "GDS"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; +def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; + +class GLCBaseMatchClass <string parser> : AsmOperandClass { + let Name = "GLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; +def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; + +class SLCBaseMatchClass <string parser> : AsmOperandClass { + let Name = "SLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def 
SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; +def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; +def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; + +class TFEBaseMatchClass <string parser> : AsmOperandClass { + let Name = "TFE"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; +def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; +def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; + +def OModMatchClass : AsmOperandClass { + let Name = "OMod"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def ClampMatchClass : AsmOperandClass { + let Name = "Clamp"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { + let Name = "SMRDOffset"#predicate; + let PredicateMethod = predicate; + let RenderMethod = "addImmOperands"; +} + +def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; +def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < + "isSMRDLiteralOffset" +>; + +let OperandType = "OPERAND_IMMEDIATE" in { + +def offen : Operand<i1> { + let PrintMethod = "printOffen"; +} +def idxen : Operand<i1> { + let PrintMethod = "printIdxen"; +} +def addr64 : Operand<i1> { + let PrintMethod = "printAddr64"; +} +def mbuf_offset : Operand<i16> { + let PrintMethod = "printMBUFOffset"; + let ParserMatchClass = MubufOffsetMatchClass; +} +class ds_offset_base <AsmOperandClass mc> : Operand<i16> { + let PrintMethod = "printDSOffset"; + let ParserMatchClass = mc; +} +def ds_offset : ds_offset_base <DSOffsetMatchClass>; +def ds_offset_gds : ds_offset_base <DSOffsetGDSMatchClass>; + +def ds_offset0 : Operand<i8> { + let PrintMethod = "printDSOffset0"; + let ParserMatchClass = DSOffset01MatchClass; +} +def ds_offset1 : Operand<i8> { + let PrintMethod = "printDSOffset1"; + let ParserMatchClass = DSOffset01MatchClass; +} +class gds_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printGDS"; + let ParserMatchClass = mc; +} +def gds : gds_base <GDSMatchClass>; + +def gds01 : gds_base <GDS01MatchClass>; + +class glc_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printGLC"; + let ParserMatchClass = mc; +} + +def glc : glc_base <GLCMubufMatchClass>; +def glc_flat : glc_base <GLCFlatMatchClass>; + +class slc_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printSLC"; + let ParserMatchClass = mc; +} + +def slc : slc_base <SLCMubufMatchClass>; +def slc_flat : slc_base <SLCFlatMatchClass>; +def slc_flat_atomic : slc_base <SLCFlatAtomicMatchClass>; + +class tfe_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printTFE"; + let ParserMatchClass = mc; +} + +def tfe : tfe_base <TFEMubufMatchClass>; +def tfe_flat : tfe_base <TFEFlatMatchClass>; +def tfe_flat_atomic : tfe_base <TFEFlatAtomicMatchClass>; + +def omod : Operand <i32> { + let PrintMethod = "printOModSI"; + let ParserMatchClass = OModMatchClass; +} + +def ClampMod : Operand <i1> { + let PrintMethod = "printClampSI"; + let ParserMatchClass = ClampMatchClass; +} + +def smrd_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDOffsetMatchClass; +} + +def smrd_literal_offset : Operand <i32> { + let 
PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDLiteralOffsetMatchClass; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +def VOPDstS64 : VOPDstOperand <SReg_64>; + +//===----------------------------------------------------------------------===// +// Complex patterns +//===----------------------------------------------------------------------===// + +def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; + +def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; +def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; +def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; + +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + +def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; +def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; +def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; +def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; +def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; +def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">; + +//===----------------------------------------------------------------------===// +// SI assembler operands +//===----------------------------------------------------------------------===// + +def SIOperand { + int ZERO = 0x80; + int VCC = 0x6A; + int FLAT_SCR = 0x68; +} + +def SRCMODS { + int NONE = 0; + int NEG = 1; +} + +def DSTCLAMP { + int NONE = 0; +} + +def DSTOMOD { + int NONE = 0; +} + +//===----------------------------------------------------------------------===// +// +// SI Instruction multiclass helpers. +// +// Instructions with _32 take 32-bit operands. +// Instructions with _64 take 64-bit operands. +// +// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit +// encoding is the standard encoding, but instruction that make use of +// any of the instruction modifiers must use the 64-bit encoding. +// +// Instructions with _e32 use the 32-bit encoding. +// Instructions with _e64 use the 64-bit encoding. 
+// +//===----------------------------------------------------------------------===// + +class SIMCInstr <string pseudo, int subtarget> { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +//===----------------------------------------------------------------------===// +// EXP classes +//===----------------------------------------------------------------------===// + +class EXPCommon : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), + "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + let EXP_CNT = 1; + let Uses = [EXEC]; +} + +multiclass EXP_m { + + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + } + + def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + + def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; +} + +//===----------------------------------------------------------------------===// +// Scalar classes +//===----------------------------------------------------------------------===// + +class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOP1 <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, + SOP1e <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; +} + +class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, + SOP1e <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; +} + +multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : SOP1_Pseudo <opName, outs, ins, pattern>; + + def _si : SOP1_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; + +multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +// no input, 64-bit output. +multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst"> { + let ssrc0 = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst"> { + let ssrc0 = 0; + } +} + +// 64-bit input, no output +multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } +} + +// 64-bit input, 32-bit output. 
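+// Illustrative example (not from this patch; opcode operand elided as "op"):
+//   defm S_BCNT1_I32_B64 : SOP1_32_64 <op, "s_bcnt1_i32_b64",
+//     [(set i32:$dst, (ctpop i64:$src0))]>;
+// i.e. count the set bits of a 64-bit SGPR pair into a 32-bit SGPR result.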
+multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : + SOP2<outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 4; + + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. + field bits<7> sdst = 0; +} + +class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, + SOP2e<op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, + SOP2e<op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : SOP2_Pseudo <opName, outs, ins, pattern>; + + def _si : SOP2_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []> { + let Defs = [SCC]; +} + +class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_32, i32, opName, cond>; + +class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_64, i64, opName, cond>; + +class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOPK <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, + SOPKe <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; +} + +class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, + SOPKe <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; +} + +multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm, + string asm = opName#opAsm> { + def "" : SOPK_Pseudo <opName, outs, ins, []>; + + def _si : SOPK_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOPK_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0), + pattern>; + + def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0">; + + def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins 
u16imm:$src0), + opName#" $dst, $src0">; +} + +multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs), + (ins SReg_32:$src0, u16imm:$src1), pattern> { + let Defs = [SCC]; + } + + + def _si : SOPK_Real_si <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } + + def _vi : SOPK_Real_vi <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } +} + +multiclass SOPK_32TIE <sopk op, string opName, list<dag> pattern> : SOPK_m < + op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), + " $sdst, $simm16" +>; + +multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins, + string argAsm, string asm = opName#argAsm> { + + def "" : SOPK_Pseudo <opName, outs, ins, []>; + + def _si : SOPK <outs, ins, asm, []>, + SOPK64e <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; + } + + def _vi : SOPK <outs, ins, asm, []>, + SOPK64e <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; + } +} +//===----------------------------------------------------------------------===// +// SMRD classes +//===----------------------------------------------------------------------===// + +class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SMRD <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMRDe <op, imm>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, + SMEMe_vi <op, imm>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, + string asm, list<dag> pattern> { + + def "" : SMRD_Pseudo <opName, outs, ins, pattern>; + + def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; + + // glc is only applicable to scalar stores, which are not yet + // implemented. 
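+  // (The glc bit exists only in the VI SMEM encoding; the SI SMRD encoding
+  // has no such field, which is why only the _vi def below ties it to 0.)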
+ let glc = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; + } +} + +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1 in { + def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; + + let sbase = 0, offset = 0 in { + let sdst = 0 in { + def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; + } + + let glc = 0, sdata = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + } + } + } +} + +class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : + SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let sbase = 0; + let sdata = 0; + let glc = 0; + let offset = 0; +} + +multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, + RegisterClass dstClass> { + defm _IMM : SMRD_m < + op, opName#"_IMM", 1, (outs dstClass:$dst), + (ins baseClass:$sbase, smrd_offset:$offset), + opName#" $dst, $sbase, $offset", [] + >; + + def _IMM_ci : SMRD < + (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + let AssemblerPredicates = [isCIOnly]; + } + + defm _SGPR : SMRD_m < + op, opName#"_SGPR", 0, (outs dstClass:$dst), + (ins baseClass:$sbase, SReg_32:$soff), + opName#" $dst, $sbase, $soff", [] + >; +} + +//===----------------------------------------------------------------------===// +// Vector ALU classes +//===----------------------------------------------------------------------===// + +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printOperandAndMods"; +} + +def InputModsMatchClass : AsmOperandClass { + let Name = "RegWithInputMods"; +} + +def InputModsNoDefault : Operand <i32> { + let PrintMethod = "printOperandAndMods"; + let ParserMatchClass = InputModsMatchClass; +} + +class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { + int ret = + !if (!eq(Src0.Value, untyped.Value), 0, + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 + 3))); // VOP3 +} + +// Returns the register class to use for the destination of VOP[123C] +// instructions for the given VT. +class getVALUDstForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>))); // else VT == i1 +} + +// Returns the register class to use for source 0 of VOP[12C] +// instructions for the given VT. +class getVOPSrc0ForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); +} + +// Returns the register class to use for source 1 of VOP[12C] for the +// given VT. +class getVOPSrc1ForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); +} + +// Returns the register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3SrcForVT<ValueType VT> { + RegisterOperand ret = + !if(!eq(VT.Size, 64), + VCSrc_64, + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32 + ) + ); +} + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? 
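+// For example (illustrative): hasModifiers<f32>.ret and hasModifiers<f64>.ret
+// are 1, while hasModifiers<i32>.ret is 0, so integer VOP3 forms are built
+// without the source-modifier/$clamp/$omod operands.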
+class hasModifiers<ValueType SrcVT> { + bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, 0)); +} + +// Returns the input arguments for VOP[12C] instructions for the given SrcVT. +class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { + dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 + !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 + (ins))); +} + +// Returns the input arguments for VOP3 instructions for the given SrcVT. +class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, + bit HasModifiers> { + + dag ret = + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP1 without modifiers + (ins Src0RC:$src0) + /* endif */ ), + !if (!eq(NumSrcArgs, 2), + !if (!eq(HasModifiers, 1), + // VOP 2 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP2 without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + /* endif */ ) + /* NumSrcArgs == 3 */, + !if (!eq(HasModifiers, 1), + // VOP3 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + InputModsNoDefault:$src2_modifiers, Src2RC:$src2, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP3 without modifiers + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + /* endif */ ))); +} + +// Returns the assembly string for the inputs and outputs of a VOP[12C] +// instruction. This does not add the _e32 suffix, so it can be reused +// by getAsm64. +class getAsm32 <bit HasDst, int NumSrcArgs> { + string dst = "$dst"; + string src0 = ", $src0"; + string src1 = ", $src1"; + string src2 = ", $src2"; + string ret = !if(HasDst, dst, "") # + !if(!eq(NumSrcArgs, 1), src0, "") # + !if(!eq(NumSrcArgs, 2), src0#src1, "") # + !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); +} + +// Returns the assembly string for the inputs and outputs of a VOP3 +// instruction. 
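+// For example (illustrative), a two-source profile with modifiers produces
+//   "$dst, $src0_modifiers, $src1_modifiers$clamp$omod"
+// while one without modifiers falls back to getAsm32's "$dst, $src0, $src1".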
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string ret = + !if(!eq(HasModifiers, 0), + getAsm32<HasDst, NumSrcArgs>.ret, + "$dst, "#src0#src1#src2#"$clamp"#"$omod"); +} + +class VOPProfile <list<ValueType> _ArgVT> { + + field list<ValueType> ArgVT = _ArgVT; + + field ValueType DstVT = ArgVT[0]; + field ValueType Src0VT = ArgVT[1]; + field ValueType Src1VT = ArgVT[2]; + field ValueType Src2VT = ArgVT[3]; + field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; + field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; + field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; + field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; + field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; + + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); + field bit HasDst32 = HasDst; + field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; + field bit HasModifiers = hasModifiers<Src0VT>.ret; + + field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + + // VOP3b instructions are a special case with a second explicit + // output. This is manually overridden for them. + field dag Outs32 = Outs; + field dag Outs64 = Outs; + + field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; + field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasModifiers>.ret; + + field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; +} + +// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order +// for the instruction patterns to work. +def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; + +def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; + +def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; + +def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; +def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; +def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; +def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; +def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; + +def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; +def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; +def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; +def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; + +// Write out to vcc or arbitrary SGPR. 
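+// (Illustrative: this is the profile shape used by carry-out instructions
+// such as v_add_i32/v_sub_i32; the 32-bit encoding implicitly writes VCC,
+// while the 64-bit encoding names an explicit SGPR pair in $sdst.)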
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$dst, vcc, $src0, $src1"; + let Asm64 = "$dst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. + let Src0RC32 = VCSrc_32; + let Asm32 = "$dst, vcc, $src0, $src1, vcc"; + let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); +} + +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; +} + +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. +class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. 
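+  // Illustrative: the e32 form of a compare is printed as, e.g.,
+  //   v_cmp_lt_f32_e32 vcc, $src0, $src1
+  // with "vcc" coming from Asm32 above rather than from a $dst operand.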
+ let HasDst32 = 0; +} + +class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOPC_I1_F32_F32 : VOPC_Profile<f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<f64>; +def VOPC_I1_I32_I32 : VOPC_Profile<i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<i64>; + +def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; + +def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; +def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); + let Asm64 = "$dst, $src0, $src1, $src2"; +} + +def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); + field string Asm = "$dst, $src0, $vsrc1, $src2"; +} +def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); + let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, + HasModifiers>.ret; + let Asm32 = getAsm32<1, 2>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers>.ret; +} +def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; +def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; + +class SIInstAlias <string asm, Instruction inst, VOPProfile p> : + InstAlias <asm, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$dst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$dst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); +} + +class SIInstAliasSI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class SIInstAliasVI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { + let AssemblerPredicates = [isVI]; +} + +multiclass SIInstAliasBuilder <string asm, VOPProfile p> { + + def : SIInstAliasSI <asm, NAME, p>; + + def : SIInstAliasVI <asm, NAME, p>; +} + +class VOP <string opName> { + string OpName = opName; +} + +class VOP2_REV <string revOp, bit isOrig> { + string RevOp = revOp; + bit IsOrig = isOrig; +} + +class AtomicNoRet <string noRetOp, bit isRet> { + string NoRetOp = noRetOp; + bit IsRet = isRet; +} + +class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP1Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr <opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + field bits<8> vdst; + field bits<9> src0; +} + +class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.SI, outs, ins, asm, []>, + 
SIMCInstr <opName#"_e32", SISubtarget.SI> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.VI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; + + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; + + def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; + +} + +multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; + + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; +} + +class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP2Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.VI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; +} + +multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; + + def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; + +} + +class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { + + bits<2> src0_modifiers = !if(HasModifiers, ?, 0); + bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); + bits<2> omod = !if(HasModifiers, ?, 0); + bits<1> clamp = !if(HasModifiers, ?, 0); + bits<9> src1 = !if(HasSrc1, ?, 0); + bits<9> src2 = !if(HasSrc2, ?, 0); +} + +class VOP3DisableModFields <bit HasSrc0Mods, + bit HasSrc1Mods = 0, + bit HasSrc2Mods = 0, + bit HasOutputMods = 0> { + bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); + bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); + bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); + bits<2> omod = !if(HasOutputMods, ?, 0); + bits<1> clamp = !if(HasOutputMods, ?, 0); +} + +class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP3Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e64", SISubtarget.NONE>, + MnemonicAlias<opName#"_e64", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + field bit vdst; + field bit src0; +} + +class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3e <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + 
VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; +} + +multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; +} + +multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; + // No VI instruction. This class is for SI only. +} + +multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; +} + +multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + // No VI instruction. This class is for SI only. +} + +// Two operand VOP3b instruction that may have a 3rd SGPR bool operand +// instead of an implicit VCC as in the VOP2b format. 
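+// (Illustrative: e.g. the _e64 form of a carry-out add can steer the carry to
+// an arbitrary SGPR pair through $sdst instead of implicitly writing VCC.)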
+multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, useSrc2Input, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, useSrc2Input, HasMods>; +} + +multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, + bit HasMods, bit defExec, + string revOp, list<SchedReadWrite> sched> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } +} + +// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. +multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : VOPAnyCommon <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE>; + } + + def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, + SIMCInstr <opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op.VI3>, + VOP3DisableFields <1, 0, 0>, + SIMCInstr <opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + } +} + +multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64> { + + defm _e32 : VOP1_m <op, opName, p, pat32>; + + defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + p.HasModifiers>; +} + +multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP1_Helper < + op, opName, P, [], + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) +>; + +multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> { + + defm _e32 : VOP1SI_m <op, opName, P, []>; + + defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + opName, P.HasModifiers>; +} + +multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64, string revOp> { + + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; +} + +multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2_Helper < + op, opName, P, [], + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp +>; + +multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = 
null_frag, + string revOp = opName> { + + defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; + + defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + opName, revOp, P.HasModifiers>; +} + +multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { + + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } + + defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } +} + +multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2b_Helper < + op, opName, P, [], + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, !eq(P.NumSrcArgs, 3) +>; + +// A VOP2 instruction that is VOP3-only on VI. +multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, string revOp> { + + defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; +} + +multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> + : VOP2_VI3_Helper < + op, opName, P, [], + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp +>; + +multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { + + def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>; + +let isCodeGenOnly = 0 in { + def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>, + VOP2_MADKe <op.SI> { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>, + VOP2_MADKe <op.VI> { + let AssemblerPredicates = [isVI]; + } +} // End isCodeGenOnly = 0 +} + +class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : + VOPCCommon <ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, + string opName, bit DefExec, VOPProfile p, + list<SchedReadWrite> sched, + string revOpName = "", string asm = opName#"_e32 "#op_asm, + string alias_asm = opName#" "#op_asm> { + def "" : VOPC_Pseudo <ins, pattern, opName> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = sched; + } + + let AssemblerPredicates = [isSICI] in { + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + 
let SchedRW = sched; + } + + } // End AssemblerPredicates = [isSICI] + + let AssemblerPredicates = [isVI] in { + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isVI] + + defm : SIInstAliasBuilder<alias_asm, p>; +} + +multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. +multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>, + VOP3DisableModFields<1, 0, 0>; +} + +multiclass VOPCInst <vopc op, string opName, + VOPProfile P, PatLeaf cond = COND_NULL, + string revOp = opName, + bit DefExec = 0, + list<SchedReadWrite> sched = [Write32Bit]> : + VOPC_Helper < + op, opName, [], + !if(P.HasModifiers, + [(set i1:$dst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + DefExec, revOp, P, sched +>; + +multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, + bit DefExec = 0, + list<SchedReadWrite> sched> : VOPC_Class_Helper < + op, opName, [], + !if(P.HasModifiers, + [(set i1:$dst, + (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], + [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + DefExec, opName, P, sched +>; + + +multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; + +multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; + +multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; + +multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; + + +multiclass VOPCX <vopc op, string opName, VOPProfile P, + PatLeaf cond = COND_NULL, + list<SchedReadWrite> sched, + string revOp = ""> + : VOPCInst <op, opName, P, cond, revOp, 1, sched>; + +multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; + +multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; + +multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; + +multiclass VOPCX_I64 <vopc op, string opName, string revOp = 
opName> : + VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; + +multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, + list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods +>; + +multiclass VOPC_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; + +multiclass VOPCX_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; + +multiclass VOPC_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; + +multiclass VOPCX_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; + +multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. 
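+// (The VCC read shows up as the trailing (i1 VCC) operand of the selection
+// pattern below, rather than as an explicit source operand.)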
+multiclass VOP3_VCC_Inst <vop3 op, string opName, + VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, + (outs P.DstRC.RegClass:$dst), + (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, + InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, + InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, + ClampMod:$clamp, + omod:$omod), + "$dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))], + 3, 1 +>; + +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : + VOP3b_2_3_m < + op, P.Outs64, P.Ins64, + opName#" "#P.Asm64, pattern, + opName, "", 1, 1 +>; + +class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), + (Inst i32:$src0_modifiers, P.Src0VT:$src0, + i32:$src1_modifiers, P.Src1VT:$src1, + i32:$src2_modifiers, P.Src2VT:$src2, + i1:$clamp, + i32:$omod)>; + +//===----------------------------------------------------------------------===// +// Interpolation opcodes +//===----------------------------------------------------------------------===// + +class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + VINTRPCommon <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon <outs, ins, asm, []>, + VINTRPe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon <outs, ins, asm, []>, + VINTRPe_vi <op>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, + list<dag> pattern = []> { + def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>; + + def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>; + + def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>; +} + +//===----------------------------------------------------------------------===// +// Vector I/O classes +//===----------------------------------------------------------------------===// + +class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + DS <outs, ins, "", pattern>, + SIMCInstr <opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe <op>, + SIMCInstr <opName, SISubtarget.SI> { + let isCodeGenOnly = 0; +} + +class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe_vi <op>, + SIMCInstr <opName, SISubtarget.VI>; + +class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS_Real_si <op,opName, outs, ins, asm> { + + // Single load interpret the 2 i8imm operands as a single i16 offset. 
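+  // For example (illustrative): offset = 0x1234 is emitted as offset0 = 0x34
+  // and offset1 = 0x12 through the slices below.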
+ bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; + let isCodeGenOnly = 0; +} + +class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS_Real_vi <op, opName, outs, ins, asm> { + + // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; +} + +multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr"#"$offset$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + gds01:$gds), + string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<opName, 0>; + + let data1 = 0, vdst = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, + string noRetOp = "", + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { + + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; + + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } + } +} + +multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, + string noRetOp = "", dag ins, + dag outs = (outs rc:$vdst), + string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { + + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; + + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = "", RegisterClass src = rc> : + DS_1A2D_RET_m <op, asm, rc, noRetOp, + (ins VGPR_32:$addr, src:$data0, src:$data1, + ds_offset:$offset, gds:$gds) +>; + +multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc, + 
string noRetOp = opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 0>; + + let vdst = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_0A_RET <bits<8> op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>; + + let addr = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } // end addr = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} + +multiclass DS_1A_RET_GDS <bits<8> op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), + string asm = opName#" $vdst, $addr"#"$offset gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0, gds = 1 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } // end data0 = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A_GDS <bits<8> op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr), + string asm = opName#" $addr gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } // end vdst = 0, data = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A <bits<8> op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>; + + let vdst = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } // let vdst = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} + +//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MTBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, + string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe_vi <op>, + SIMCInstr <opName, SISubtarget.VI>; + +multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MTBUF_Pseudo <opName, outs, ins, pattern>; + + def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; + + def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; + +} + +let mayStore = 1, mayLoad = 0 in { + +multiclass MTBUF_Store_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs), + (ins regClass:$vdata, u16imm:$offset, 
i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayStore = 1, mayLoad = 0 + +let mayLoad = 1, mayStore = 0 in { + +multiclass MTBUF_Load_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayLoad = 1, mayStore = 0 + +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// + +class mubuf <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +let isCodeGenOnly = 0 in { + +class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { + let lds = 0; +} + +} // End let isCodeGenOnly = 0 + +class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> { + let lds = 0; +} + +class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MUBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + // dummy fields, so that we can use let statements around multiclasses + bits<1> offen; + bits<1> idxen; + bits<8> vaddr; + bits<1> glc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let lds = 0; +} + +class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe_vi <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let lds = 0; +} + +multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0>; + + let addr64 = 0, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; +} + +multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, + dag ins, string asm, list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1>; + + let addr64 = 1, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. 
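+  // (VI removed the addr64 MUBUF addressing mode, which is why no _vi real
+  // encoding is defined in this multiclass.)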
+} + +multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_OFFSET", is_return>; + + let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_ADDR64", is_return>; + + let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. +} + +multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, + ValueType vt, SDPatternOperator atomic> { + + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + + // No return variants + let glc = 0 in { + + defm _ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_addr64", (outs), + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 + >; + + defm _OFFSET : MUBUFAtomicOffset_m < + op, name#"_offset", (outs), + (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, + slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + >; + } // glc = 0 + + // Variant that return values + let glc = 1, Constraints = "$vdata = $vdata_in", + DisableEncoding = "$vdata_in" in { + + defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_rtn_addr64", (outs rc:$vdata), + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", + [(set vt:$vdata, + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$vdata_in))], 1 + >; + + defm _RTN_OFFSET : MUBUFAtomicOffset_m < + op, name#"_rtn_offset", (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc", + [(set vt:$vdata, + (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, + i1:$slc), vt:$vdata_in))], 1 + >; + + } // glc = 1 + + } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 +} + +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. 
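+// (Illustrative: with tfe set, even a single-dword buffer load would need a
+// VReg_64-class destination so the extra status dword has a register to land
+// in, hence the separate-opcode FIXME above.)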
+multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + let mayLoad = 1, mayStore = 0 in { + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata), + (ins SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe)))]>; + } + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata), + (ins VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata), + (ins VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata), + (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata), + (ins VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, + glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"# + "$glc"#"$slc"#"$tfe", + [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, + i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, + i1:$tfe)))]>; + } + } +} + +multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, + ValueType store_vt = i32, SDPatternOperator st = null_frag> { + let mayLoad = 0, mayStore = 1 in { + defm : MUBUF_m <op, name, (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; + + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; + } // offen = 0, idxen = 0, vaddr = 0 + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; + } // end offen = 1, idxen = 0 + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 1, idxen = 1 
in { + defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs), + (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), + (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"# + "$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe))]>; + } + } // End mayLoad = 0, mayStore = 1 +} + +// For cache invalidation instructions. +multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { + def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; + + // Set everything to 0. + let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, + vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; + } + + def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; + } + } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" +} + +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// + +class flat <bits<7> ci, bits<7> vi = ci> { + field bits<7> CI = ci; + field bits<7> VI = vi; +} + +class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + FLAT <0, outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicate = isCIOnly; +} + +class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicate = VIAssemblerPredicate; +} + +multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, + list<dag> pattern> { + def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, + AtomicNoRet <NAME, 1>; + + def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; +} + +multiclass FLAT_Load_Helper <flat op, string asm_name, + RegisterClass regClass, + dag outs = (outs regClass:$vdst), + dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + + let data = 0, mayLoad = 1 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_Store_Helper <flat op, string asm_name, + RegisterClass vdataClass, + dag outs = (outs), + dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, + slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + + let mayLoad = 0, mayStore = 1, vdst = 0 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + 
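Editor's note on the pattern used by the MUBUF and FLAT helpers above: each multiclass instantiates one subtarget-neutral pseudo (SIMCInstr<..., SISubtarget.NONE>) plus per-subtarget real encodings (_si, _ci, _vi), and the comments point out that some forms (for example the addr64 variants) simply have no VI encoding. The sketch below is a minimal standalone C++ model of how such a pseudo is resolved to a concrete opcode per subtarget. It is illustrative only, with invented opcode numbers and names; in this LLVM snapshot the actual lookup is the TableGen-generated getMCOpcodeGen table defined further down, consumed by SIInstrInfo::pseudoToMCOpcode().

#include <cstdio>
#include <map>
#include <utility>

// Illustrative model: a pseudo opcode maps, per subtarget, to a real
// encoding, or to -1 when that generation has no encoding for it
// (the situation the "There is no VI version" comments describe).
enum class Subtarget { SI, VI };

static const std::map<std::pair<int, Subtarget>, int> RealOpcode = {
    {{/*LOAD_ADDR64 pseudo*/ 100, Subtarget::SI}, 200},
    {{/*LOAD_ADDR64 pseudo*/ 100, Subtarget::VI}, -1},   // no VI version
    {{/*LOAD_OFFSET pseudo*/ 101, Subtarget::SI}, 201},
    {{/*LOAD_OFFSET pseudo*/ 101, Subtarget::VI}, 301},
};

// Returns the real opcode for this subtarget, the input opcode if it is
// not a pseudo in this model, or -1 if it cannot be encoded here.
static int pseudoToRealOpcode(int PseudoOpc, Subtarget ST) {
  auto It = RealOpcode.find({PseudoOpc, ST});
  if (It == RealOpcode.end())
    return PseudoOpc;
  return It->second;
}

int main() {
  std::printf("%d\n", pseudoToRealOpcode(100, Subtarget::SI)); // 200
  std::printf("%d\n", pseudoToRealOpcode(100, Subtarget::VI)); // -1
  std::printf("%d\n", pseudoToRealOpcode(101, Subtarget::VI)); // 301
  return 0;
}

A -1 result tells the caller that the pseudo must be rewritten (or rejected) before emission on that generation, which is exactly why the addr64 multiclasses above only emit an _si real.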
+multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc, + dag outs_noret = (outs), + string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> { + + let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { + def "" : FLAT_Pseudo <NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + AtomicNoRet <NAME, 0>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + } + + let glc = 1, hasPostISelHook = 1 in { + defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; + } +} + +class MIMG_Mask <string op, int channels> { + string Op = op; + int Channels = channels; +} + +class MIMG_NoSampler_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc", + []> { + let ssamp = 0; + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; +} + +multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>, + MIMG_Mask<asm#"_V4", channels>; +} + +multiclass MIMG_NoSampler <bits<7> op, string asm> { + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; +} + +class MIMG_Sampler_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let WQM = wqm; +} + +multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, + MIMG_Mask<asm#"_V4", channels>; + def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, + MIMG_Mask<asm#"_V8", channels>; + def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, + MIMG_Mask<asm#"_V16", channels>; +} + +multiclass MIMG_Sampler <bits<7> op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 
: MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Sampler_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>; +} + +class MIMG_Gather_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + + // DMASK was repurposed for GATHER4. 4 components are always + // returned and DMASK works like a swizzle - it selects + // the component to fetch. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. + // Therefore, disable all code which updates DMASK by setting these two: + let MIMG = 0; + let hasPostISelHook = 0; + let WQM = wqm; +} + +multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, + MIMG_Mask<asm#"_V4", channels>; + def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, + MIMG_Mask<asm#"_V8", channels>; + def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, + MIMG_Mask<asm#"_V16", channels>; +} + +multiclass MIMG_Gather <bits<7> op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Gather_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>; +} + +//===----------------------------------------------------------------------===// +// Vector instruction mappings +//===----------------------------------------------------------------------===// + +// Maps an opcode in e32 form to its e64 equivalent +def getVOPe64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; +} + +// Maps an opcode in e64 form to its e32 equivalent +def getVOPe32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["4"]]; +} + +def getMaskedMIMGOp : InstrMapping { + let FilterClass = "MIMG_Mask"; + let RowFields = ["Op"]; + let ColFields = ["Channels"]; + let KeyCol = ["4"]; + let ValueCols = [["1"], ["2"], ["3"] ]; +} + +// Maps an commuted opcode to its original version +def getCommuteOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let 
ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its commuted version +def getCommuteRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +def getCommuteCmpOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its commuted version +def getCommuteCmpRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + + +def getMCOpcodeGen : InstrMapping { + let FilterClass = "SIMCInstr"; + let RowFields = ["PseudoInstr"]; + let ColFields = ["Subtarget"]; + let KeyCol = [!cast<string>(SISubtarget.NONE)]; + let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]]; +} + +def getAddr64Inst : InstrMapping { + let FilterClass = "MUBUFAddr64Table"; + let RowFields = ["OpName"]; + let ColFields = ["IsAddr64"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its version with a return value. +def getAtomicRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its returnless version. +def getAtomicNoRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +include "SIInstructions.td" +include "CIInstructions.td" +include "VIInstructions.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td new file mode 100644 index 0000000..89692ab --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -0,0 +1,3279 @@ +//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file was originally auto-generated from a GPU register header file and +// all the instruction definitions were originally commented out. Instructions +// that are not yet supported remain commented out. 
+//===----------------------------------------------------------------------===// + +class InterpSlots { +int P0 = 2; +int P10 = 0; +int P20 = 1; +} +def INTERP : InterpSlots; + +def InterpSlot : Operand<i32> { + let PrintMethod = "printInterpSlot"; +} + +def SendMsgImm : Operand<i32> { + let PrintMethod = "printSendMsg"; +} + +def isGCN : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureGCN">; +def isSI : Predicate<"Subtarget->getGeneration() " + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + + +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; + +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} + +def WAIT_FLAG : InstFlag<"printWaitFlag"> { + let ParserMatchClass = SWaitMatchClass; +} + +let SubtargetPredicate = isGCN in { + +//===----------------------------------------------------------------------===// +// EXP Instructions +//===----------------------------------------------------------------------===// + +defm EXP : EXP_m; + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +// We are using the SGPR_32 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SGPR_32 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. +defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 +>; + +//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; + +defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv", + int_amdgcn_s_dcache_inv>; + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// + +let isMoveImm = 1 in { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; + defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>; + } // let isRematerializeable = 1 + + let Uses = [SCC] in { + defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>; + defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>; + } // End Uses = [SCC] +} // End isMoveImm = 1 + +let Defs = [SCC] in { + defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32", + [(set i32:$dst, (not i32:$src0))] + >; + + defm S_NOT_B64 : SOP1_64 
<sop1<0x08, 0x05>, "s_not_b64", + [(set i64:$dst, (not i64:$src0))] + >; + defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>; + defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>; +} // End Defs = [SCC] + + +defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", + [(set i32:$dst, (bitreverse i32:$src0))] +>; +defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; + +let Defs = [SCC] in { + defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; + defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; + defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32", + [(set i32:$dst, (ctpop i32:$src0))] + >; + defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>; +} // End Defs = [SCC] + +defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; +defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; +defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", + [(set i32:$dst, (cttz_zero_undef i32:$src0))] +>; +defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; + +defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", + [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] +>; + +defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; +defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", + [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] +>; +defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; +defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8", + [(set i32:$dst, (sext_inreg i32:$src0, i8))] +>; +defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + +defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; +defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; +defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; +defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; +defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; +defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; +defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; +defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>; +defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>; +defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>; +defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>; +defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; +defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; + +let Uses = [M0] in { +defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; +defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; +defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; +defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, 
"s_movreld_b64", []>; +} // End Uses = [M0] + +defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; +defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; +let Defs = [SCC] in { + defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>; +} // End Defs = [SCC] +defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; +defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>; +defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in { +defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] + +defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", + [(set i32:$dst, (smin i32:$src0, i32:$src1))] +>; +defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", + [(set i32:$dst, (umin i32:$src0, i32:$src1))] +>; +defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", + [(set i32:$dst, (smax i32:$src0, i32:$src1))] +>; +defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", + [(set i32:$dst, (umax i32:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + + +let Uses = [SCC] in { + defm S_CSELECT_B32 : SOP2_32 <sop2<0x0a>, "s_cselect_b32", []>; + defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>; +} // End Uses = [SCC] + +let Defs = [SCC] in { +defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>; +defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>; +defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>; +defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>; +defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>; +defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>; +defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>; +defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>; +defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>; +defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>; +} // End Defs = [SCC] + +// Use added complexity so these patterns are preferred to the VALU patterns. 
+let AddedComplexity = 1 in { +let Defs = [SCC] in { + +defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + +defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>; +defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", + [(set i32:$dst, (mul i32:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +let Defs = [SCC] in { +defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>; +defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>; +defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>; +defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; +} // End Defs = [SCC] + +let sdst = 0 in { +defm S_CBRANCH_G_FORK : SOP2_m < + sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] +>; +} + +let Defs = [SCC] in { +defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; +} // End Defs = [SCC] + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1, isMoveImm = 1 in { +defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; +} // End isReMaterializable = 1 +let Uses = [SCC] in { + defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>; +} + +let isCompare = 1 in { + +/* +This instruction is disabled for now until we can figure out how to teach +the instruction selector to correctly use the S_CMP* vs V_CMP* +instructions. 
+ +When this instruction is enabled the code generator sometimes produces this +invalid sequence: + +SCC = S_CMPK_EQ_I32 SGPR0, imm +VCC = COPY SCC +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 + +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", + [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] +>; +*/ + +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", []>; +defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>; +defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>; +defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>; +defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>; +defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>; +defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>; +defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>; +defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>; +defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>; +defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>; +defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>; +} // End isCompare = 1 + +let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", + Constraints = "$sdst = $src0" in { + defm S_ADDK_I32 : SOPK_32TIE <sopk<0x0f, 0x0e>, "s_addk_i32", []>; + defm S_MULK_I32 : SOPK_32TIE <sopk<0x10, 0x0f>, "s_mulk_i32", []>; +} + +defm S_CBRANCH_I_FORK : SOPK_m < + sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), + (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>; +defm S_SETREG_B32 : SOPK_m < + sopk<0x13, 0x12>, "s_setreg_b32", (outs), + (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +// FIXME: Not on SI? 
+//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; +defm S_SETREG_IMM32_B32 : SOPK_IMM32 < + sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), + (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" +>; + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", + [(IL_retflag)]> { + let simm16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", + [(br bb:$simm16)]> { + let isBarrier = 1; +} + +let Uses = [SCC] in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins sopp_brtarget:$simm16), + "s_cbranch_scc0 $simm16" +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins sopp_brtarget:$simm16), + "s_cbranch_scc1 $simm16" +>; +} // End Uses = [SCC] + +let Uses = [VCC] in { +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins sopp_brtarget:$simm16), + "s_cbranch_vccz $simm16" +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins sopp_brtarget:$simm16), + "s_cbranch_vccnz $simm16" +>; +} // End Uses = [VCC] + +let Uses = [EXEC] in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins sopp_brtarget:$simm16), + "s_cbranch_execz $simm16" +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins sopp_brtarget:$simm16), + "s_cbranch_execnz $simm16" +>; +} // End Uses = [EXEC] + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", + [(int_AMDGPU_barrier_local)] +> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let mayLoad = 1; + let mayStore = 1; + let isConvergent = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; + +let Uses = [EXEC, M0] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] + >; +} // End Uses = [EXEC, M0] + +def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + +let isCompare = 1, isCommutable = 1 in { + +defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; +defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; +defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>; +defm 
V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>; +defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; +defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>; +defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; +defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>; +defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>; +defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">; + + +defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32", "v_cmpx_gt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32", "v_cmpx_ge_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">; + + +defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; +defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; +defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>; +defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; +defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>; +defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; +defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>; +defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>; +defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">; + + +defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64", "v_cmpx_gt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, 
"v_cmpx_le_f64", "v_cmpx_ge_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64", "v_cmpx_nle_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">; + + +let SubtargetPredicate = isSICI in { + +defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; +defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">; + + +defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">; + + +defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, 
"v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; +defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; + + +defm V_CMPSX_F_F64 : VOPCX_F64 <vopc<0x70>, "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 <vopc<0x71>, "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 <vopc<0x73>, "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPCX_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 <vopc<0x77>, "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 <vopc<0x78>, "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 <vopc<0x79>, "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; + +} // End SubtargetPredicate = isSICI + +defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; +defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>; +defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; +defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>; +defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">; + + +defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32", "v_cmpx_gt_i32">; +defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32", "v_cmpx_ge_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">; + + +defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; +defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>; +defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; +defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>; +defm 
V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>; +defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>; +defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">; + + +defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64", "v_cmpx_gt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64", "v_cmpx_ge_i64">; +defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">; + + +defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; +defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; +defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">; + + +defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32", "v_cmpx_gt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32", "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">; + + +defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; +defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; +defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">; + +defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64", "v_cmpx_gt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64", "v_cmpx_ge_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">; + +} // End isCompare = 1, isCommutable = 1 + +defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">; +defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">; +defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, 
"v_cmpx_class_f64">; + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + +defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; +defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +let mayLoad = 0 in { +defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +} +defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; +defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; +defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; +defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; +defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; +let mayLoad = 0 in { +defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; +} +defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < + 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < + 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm 
DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; +defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; +let mayStore = 0 in { +defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; +defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; +} +defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; +defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; +defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; +defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; +defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +} +defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 
: DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; +defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; + +let mayStore = 0 in { +defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; +defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; +} + +defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; +defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; +defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; +defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; +defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; +defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; +defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; +defm DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; +defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; +defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src2_b32">; +defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; +defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; + +defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; +defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; + +defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; +defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; +defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; +defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; +defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; +defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; +defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; +defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; +defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; +defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; +defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; +defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; + +defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; +defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < + mubuf<0x00>, "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < + mubuf<0x01>, "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < + mubuf<0x02>, "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < + mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < + mubuf<0x04>, "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < + mubuf<0x05>, "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < + mubuf<0x06>, "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < + mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 +>; +defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global +>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global +>; +defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global +>; +defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load +>; + +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < + mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global +>; + +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < + mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global +>; + +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < + mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store +>; + +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store +>; + +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < + mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < + mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global +>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < + mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < + mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global +>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < + mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < + mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < + mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < + mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Atomic < + mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Atomic < + mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < + mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>; 
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI + +let SubtargetPredicate = isSI in { +defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI +} + +defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; + +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// + +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; +//def 
IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, 
"image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; +defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; +} + +let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 + +let Uses = [EXEC] in { + +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + +def V_READFIRSTLANE_B32 : VOP1 < + 0x00000002, + (outs SReg_32:$vdst), + (ins VGPR_32:$src0), + "v_readfirstlane_b32 $vdst, $src0", + [] +>; + +} + +let SchedRW = [WriteQuarterRate32] in { + +defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, 
"v_cvt_i32_f64", + VOP_I32_F64, fp_to_sint +>; +defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32", + VOP_F64_I32, sint_to_fp +>; +defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32", + VOP_F32_I32, sint_to_fp +>; +defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32", + VOP_F32_I32, uint_to_fp +>; +defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32", + VOP_I32_F32, fp_to_uint +>; +defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32", + VOP_I32_F32, fp_to_sint +>; +defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", + VOP_I32_F32, fp_to_f16 +>; +defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", + VOP_F32_I32, f16_to_fp +>; +defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32", + VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32", + VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", + VOP_F32_F64, fround +>; +defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32", + VOP_F64_F32, fextend +>; +defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 +>; +defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1", + VOP_F32_I32, AMDGPUcvt_f32_ubyte1 +>; +defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2", + VOP_F32_I32, AMDGPUcvt_f32_ubyte2 +>; +defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3", + VOP_F32_I32, AMDGPUcvt_f32_ubyte3 +>; +defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64", + VOP_I32_F64, fp_to_uint +>; +defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", + VOP_F64_I32, uint_to_fp +>; + +} // let SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32", + VOP_F32_F32, AMDGPUfract +>; +defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32", + VOP_F32_F32, ftrunc +>; +defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32", + VOP_F32_F32, fceil +>; +defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32", + VOP_F32_F32, frint +>; +defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32", + VOP_F32_F32, ffloor +>; +defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32", + VOP_F32_F32, fexp2 +>; + +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32", + VOP_F32_F32, flog2 +>; +defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32", + VOP_F32_F32, AMDGPUrcp +>; +defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32", + VOP_F32_F32 +>; +defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32", + VOP_F32_F32, AMDGPUrsq +>; + +} //let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64", + VOP_F64_F64, AMDGPUrcp +>; +defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64", + VOP_F64_F64, AMDGPUrsq +>; + +} // let SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32", + VOP_F32_F32, fsqrt +>; + +let SchedRW = [WriteDouble] in { + +defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", + VOP_F64_F64, fsqrt +>; + +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { + +defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", + VOP_F32_F32, AMDGPUsin +>; +defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", + VOP_F32_F32, AMDGPUcos +>; + +} // End SchedRW = [WriteQuarterRate32] + +defm V_NOT_B32 : VOP1Inst 
<vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", + VOP_I32_F64 +>; + +let SchedRW = [WriteDoubleAdd] in { +defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", + VOP_F64_F64 +>; + +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", + VOP_F64_F64 +>; +} // End SchedRW = [WriteDoubleAdd] + + +defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", + VOP_I32_F32 +>; +defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", + VOP_F32_F32 +>; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; +} + +let Uses = [M0, EXEC] in { +defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +} // End Uses = [M0, EXEC] + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let SchedRW = [WriteQuarterRate32] in { + +defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; +defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; +defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy +>; + +} // End SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped +>; + +} // End SchedRW = [WriteDouble] + +} // End SubtargetPredicate = isSICI + +//===----------------------------------------------------------------------===// +// VINTRP Instructions +//===----------------------------------------------------------------------===// + +let Uses = [M0, EXEC] in { + +// FIXME: Specify SchedRW for VINTRP instructions.
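The VINTRP instructions defined next evaluate a per-vertex attribute at a pixel in two multiply-add steps, and the second step accumulates into the result of the first; that accumulation is why v_interp_p2_f32 below carries the "$src0 = $dst" constraint. The following is only a rough C sketch of the usual two-step barycentric evaluation, under the assumption that p0 is the attribute value at vertex 0 and p10/p20 are its deltas toward the other two vertices (the hardware fetches those parameters from LDS through M0, which is what Uses = [M0, EXEC] reflects); the helper names are illustrative, not from the source.

    /* Sketch of the interpolation the VINTRP instructions perform; i and j are
     * the pixel's barycentric coordinates held in VGPRs. */
    static float interp_p1_f32(float p0, float p10, float i) {
      return p10 * i + p0;          /* first step: partial result in a VGPR */
    }

    static float interp_p2_f32(float p1_result, float p20, float j) {
      return p20 * j + p1_result;   /* second step: accumulates into $dst */
    }

v_interp_mov_f32 skips the arithmetic and simply selects one of the attribute parameter slots, which matches its InterpSlot immediate operand below.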
+ +multiclass V_INTERP_P1_F32_m : VINTRP_m < + 0x00000000, + (outs VGPR_32:$dst), + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), + (i32 imm:$attr)))] +>; + +let OtherPredicates = [has32BankLDS] in { + +defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS] + +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { + +defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" + +let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { + +defm V_INTERP_P2_F32 : VINTRP_m < + 0x00000001, + (outs VGPR_32:$dst), + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" + +defm V_INTERP_MOV_F32 : VINTRP_m < + 0x00000002, + (outs VGPR_32:$dst), + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End Uses = [M0, EXEC] + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +multiclass V_CNDMASK <vop2 op, string name> { + defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; + + defm _e64 : VOP3_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, + name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>; +} + +defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">; + +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", + VOP_F32_F32_F32, fadd +>; + +defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32", + VOP_F32_F32_F32, null_frag, "v_sub_f32" +>; +} // End isCommutable = 1 + +let isCommutable = 1 in { + +defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32", + VOP_F32_F32_F32, int_AMDGPU_mul +>; + +defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", + VOP_F32_F32_F32, fmul +>; + +defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24", + VOP_I32_I32_I32, AMDGPUmul_i24 +>; + +defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24", + VOP_I32_I32_I32 +>; + +defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 +>; + +defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24", + VOP_I32_I32_I32 +>; + +defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, + fminnum>; +defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32, + fmaxnum>; +defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>; + +defm V_LSHRREV_B32 : VOP2Inst < + vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshr_b32" +>; + +defm V_ASHRREV_I32 : VOP2Inst < + vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, + "v_ashr_i32" +>; + +defm 
V_LSHLREV_B32 : VOP2Inst < + vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshl_b32" +>; + +defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; + +let Constraints = "$dst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>; +} +} // End isCommutable = 1 + +defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; + +let isCommutable = 1 in { +defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; +} // End isCommutable = 1 + +let isCommutable = 1 in { +// No patterns so that the scalar instructions are always selected. +// The scalar versions will be replaced with vector when needed later. + +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions. +defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", + VOP2b_I32_I1_I32_I32 +>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; + +defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", + VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" +>; + +defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", + VOP2b_I32_I1_I32_I32_I1 +>; +defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", + VOP2b_I32_I1_I32_I32_I1 +>; +defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", + VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" +>; + +} // End isCommutable = 1 + +defm V_READLANE_B32 : VOP2SI_3VI_m < + vop3 <0x001, 0x289>, + "v_readlane_b32", + (outs SReg_32:$vdst), + (ins VGPR_32:$src0, SCSrc_32:$src1), + "v_readlane_b32 $vdst, $src0, $src1" +>; + +defm V_WRITELANE_B32 : VOP2SI_3VI_m < + vop3 <0x002, 0x28a>, + "v_writelane_b32", + (outs VGPR_32:$vdst), + (ins SReg_32:$src0, SCSrc_32:$src1), + "v_writelane_b32 $vdst, $src0, $src1" +>; + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2InstSI <vop2<0x6>, "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 + +defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy +>; +defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy +>; + +let isCommutable = 1 in { +defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>; +} // End isCommutable = 1 +} // End SubtargetPredicate = isSICI + +defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", + VOP_I32_I32_I32 +>; +defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32, int_amdgcn_mbcnt_lo +>; +defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32, int_amdgcn_mbcnt_hi +>; +defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", + VOP_F32_F32_I32, AMDGPUldexp +>; + +defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32", + VOP_I32_F32_I32>; // TODO: set "Uses = dst" + +defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst
<vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; +defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32", + VOP_I32_I32_I32 +>; +defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32", + VOP_I32_I32_I32 +>; + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { +defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32", + VOP_F32_F32_F32_F32, fmad +>; + +defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24", + VOP_I32_I32_I32_I32, AMDGPUmad_i24 +>; +defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24", + VOP_I32_I32_I32_I32, AMDGPUmad_u24 +>; +} // End isCommutable = 1 + +defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", + VOP_I32_I32_I32_I32, AMDGPUbfe_u32 +>; +defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", + VOP_I32_I32_I32_I32, AMDGPUbfe_i32 +>; + +defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", + VOP_I32_I32_I32_I32, AMDGPUbfi +>; + +let isCommutable = 1 in { +defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32", + VOP_F32_F32_F32_F32, fma +>; +defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64", + VOP_F64_F64_F64_F64, fma +>; +} // End isCommutable = 1 + +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; +defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32", + VOP_I32_I32_I32_I32 +>; +defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32", + VOP_I32_I32_I32_I32 +>; + +defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; + +defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 +>; +defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 +>; +defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", + VOP_F32_F32_F32_F32 +>; +defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", + VOP_I32_I32_I32_I32 +>; +defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", + VOP_I32_I32_I32_I32 +>; + +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; +defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32", + VOP_I32_I32_I32_I32 +>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, 
"v_cvt_pk_u8_f32", []>; +defm V_DIV_FIXUP_F32 : VOP3Inst < + vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup +>; + +let SchedRW = [WriteDoubleAdd] in { + +defm V_DIV_FIXUP_F64 : VOP3Inst < + vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup +>; + +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteDoubleAdd] in { +let isCommutable = 1 in { + +defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", + VOP_F64_F64_F64, fadd +>; +defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64", + VOP_F64_F64_F64, fmul +>; + +defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64", + VOP_F64_F64_F64, fminnum +>; +defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64", + VOP_F64_F64_F64, fmaxnum +>; + +} // isCommutable = 1 + +defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", + VOP_F64_F64_I32, AMDGPUldexp +>; + +} // let SchedRW = [WriteDoubleAdd] + +let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { + +defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32", + VOP_I32_I32_I32 +>; + +defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", + VOP_I32_I32_I32 +>; + +} // isCommutable = 1, SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteFloatFMA, WriteSALU] in { +defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", + VOP3b_F32_I1_F32_F32_F32 +>; +} + +let SchedRW = [WriteDouble, WriteSALU] in { +// Double precision division pre-scale. +defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", + VOP3b_F64_I1_F64_F64_F64 +>; +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, Uses = [VCC, EXEC] in { + +let SchedRW = [WriteFloatFMA] in { +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", + VOP_F32_F32_F32_F32, AMDGPUdiv_fmas +>; +} + +let SchedRW = [WriteDouble] in { +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", + VOP_F64_F64_F64_F64, AMDGPUdiv_fmas +>; + +} // End SchedRW = [WriteDouble] +} // End isCommutable = 1, Uses = [VCC, EXEC] + +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; + +let SchedRW = [WriteDouble] in { +defm V_TRIG_PREOP_F64 : VOP3Inst < + vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop +>; + +} // let SchedRW = [WriteDouble] + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>; +defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>; +defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>; + +defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", + VOP_F32_F32_F32_F32>; + +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = isVI in { + +defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64", + VOP_I64_I32_I64 +>; +defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64", + VOP_I64_I32_I64 +>; +defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", + VOP_I64_I32_I64 +>; + +} // End SubtargetPredicate = 
isVI + +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isPseudo = 1 in { + +// For use in patterns +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] +>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { +// 64-bit vector move instruction. This is mainly used by the SIFoldOperands +// pass to enable folding of inline immediates. +def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 + +let hasSideEffects = 1, SALU = 1 in { +def SGPR_USE : InstSI <(outs),(ins), "", []>; +} + +// SI pseudo instructions. These are used by the CFG structurizer pass +// and should be lowered to ISA instructions prior to codegen. + +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { +let Uses = [EXEC], Defs = [EXEC] in { + +let isBranch = 1, isTerminator = 1 in { + +def SI_IF: InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, brtarget:$target), + "", + [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target), + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { + let Constraints = "$src = $dst"; +} + +def SI_LOOP : InstSI < + (outs), + (ins SReg_64:$saved, brtarget:$target), + "si_loop $saved, $target", + [(int_SI_loop i64:$saved, bb:$target)] +>; + +} // end isBranch = 1, isTerminator = 1 + +def SI_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src), + "si_break $dst, $src", + [(set i64:$dst, (int_SI_break i64:$src))] +>; + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, SReg_64:$src), + "si_if_break $dst, $vcc, $src", + [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src0, SReg_64:$src1), + "si_else_break $dst, $src0, $src1", + [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] +>; + +def SI_END_CF : InstSI < + (outs), + (ins SReg_64:$saved), + "si_end_cf $saved", + [(int_SI_end_cf i64:$saved)] +>; + +} // End Uses = [EXEC], Defs = [EXEC] + +let Uses = [EXEC], Defs = [EXEC,VCC] in { +def SI_KILL : InstSI < + (outs), + (ins VSrc_32:$src), + "si_kill $src", + [(int_AMDGPU_kill f32:$src)] +>; +} // End Uses = [EXEC], Defs = [EXEC,VCC] + +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 + +let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { + +class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins rc:$src, VSrc_32:$idx, i32imm:$off), + "si_indirect_src $dst, $temp, $src, $idx, $off", + [] +>; + +class SI_INDIRECT_DST<RegisterClass rc> : InstSI < + (outs rc:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), + "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", + [] +> { + let Constraints = "$src = $dst"; +} + +// TODO: We can support indirect SGPR access.
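The SI_INDIRECT_SRC/DST pseudos model a read or write of one 32-bit lane of a register tuple selected by a runtime index; they are lowered later, broadly via the M0-relative v_movrels_b32 / v_movreld_b32 moves defined earlier, which is consistent with the block listing M0 among its Defs. A loose C sketch, assuming a 4 x 32-bit value kept entirely in registers, of what the per-width variants below stand for; the type and function names are made up for the sketch.

    /* Illustrative only: dynamically indexed lane access, as modelled by
     * SI_INDIRECT_SRC_V4 / SI_INDIRECT_DST_V4. */
    typedef struct { float lane[4]; } vec4;

    static float indirect_src_v4(const vec4 *v, int idx) {
      return v->lane[idx];            /* read one lane selected at runtime */
    }

    static void indirect_dst_v4(vec4 *v, int idx, float val) {
      v->lane[idx] = val;             /* overwrite one lane, keep the rest */
    }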
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; +def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; +def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; + +def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; +def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; +def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; +def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; +def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; + +} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] + +multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { + + let UseNamedOperandTable = 1, Uses = [EXEC] in { + def _SAVE : InstSI < + (outs), + (ins sgpr_class:$src, i32imm:$frame_idx), + "", [] + > { + let mayStore = 1; + let mayLoad = 0; + } + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins i32imm:$frame_idx), + "", [] + > { + let mayStore = 0; + let mayLoad = 1; + } + } // End UseNamedOperandTable = 1 +} + +// It's unclear whether you can use M0 as the output of v_readlane_b32 +// instructions, so use SGPR_32 register class for spills to prevent +// this from happening. +defm SI_SPILL_S32 : SI_SPILL_SGPR <SGPR_32>; +defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; +defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; +defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; +defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; + +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { + let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + > { + let mayStore = 1; + let mayLoad = 0; + } + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + > { + let mayStore = 0; + let mayLoad = 1; + } + } // End UseNamedOperandTable = 1, VGPRSpill = 1 +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; + +let Defs = [SCC] in { + +def SI_CONSTDATA_PTR : InstSI < + (outs SReg_64:$dst), + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] +> { + let SALU = 1; +} + +} // End Defs = [SCC] + +} // end IsCodeGenOnly, isPseudo + +} // end SubtargetPredicate = isGCN + +let Predicates = [isGCN] in { + +def : Pat< + (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), + (V_CNDMASK_B32_e64 $src2, $src1, + (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, + DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KILL 0xbf800000) +>; + +/* int_SI_vs_load_input */ +def : Pat< + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) +>; + +/* int_SI_export */ +def : Pat < + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, + f32:$src0, f32:$src1, f32:$src2, f32:$src3), + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, + $src0, $src1, $src2, $src3) +>; + +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern <string Instr, ValueType vt> { + + // 1. 
IMM offset + def : Pat < + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) + >; + + // 2. SGPR offset + def : Pat < + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) + >; + + def : Pat < + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) + > { + let Predicates = [isCIOnly]; + } +} + +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. +let AddedComplexity = 100 in { + +defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; + +// 1. Offset as an immediate +def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset) +>; + +// 2. Offset loaded in a 32-bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset) +>; + +let Predicates = [isCI] in { + +def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset) +>; + +} // End Predicates = [isCI] + +} // End let AddedComplexity = 100 + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (ctpop i64:$src)), + (i64 (REG_SEQUENCE SReg_64, + (S_BCNT1_I32_B64 $src), sub0, + (S_MOV_B32 0), sub1)) +>; + +def : Pat < + (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (S_ABS_I32 $x) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +// V_ADD_I32_e32/S_ADD_U32 produce carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat < + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_U32 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// SOPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_AMDGPU_barrier_global), + (S_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [UnsafeFPMath] in { + +//def : RcpPat<V_RCP_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F32_e32, f32>; + +def : RsqPat<V_RSQ_F32_e32, f32>; +def : RsqPat<V_RSQ_F64_e32, f64>; +} + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; + +def : Pat < + (i32 (select i1:$src0, i32:$src1, i32:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +// Pattern for V_MAC_F32 +def : Pat < + (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers), + (VOP3NoMods f32:$src2, i32:$src2_modifiers)), + (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + $src2_modifiers, $src2, $clamp, $omod) +>; + +/********** ======================= **********/ +/********** Image sampling patterns **********/ +/********** ======================= **********/ + +// Image + sampler +class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc, $sampler) +>; + +multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; +} + +// Image only +class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc) +>; + +multiclass ImagePatterns<SDPatternOperator name, string opcode> { + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; +} + +// Basic sample +defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; +defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; +defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; +defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; +defm : 
SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison +defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; +defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets +defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; +defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets +defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; + +// Gather opcodes +// Only the variants which make sense are defined. 
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; + +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; + +def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; +defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; +defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; + +/* SIsample for simple 1D texture lookup */ +def : Pat < + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleRectPattern<SDNode name, MIMG 
opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowPattern<SDNode name, MIMG opcode, + ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowArrayPattern<SDNode name, MIMG opcode, + ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +/* SIsample* for texture lookups consuming more address parameters */ +multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l, + MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b, +MIMG sample_d, MIMG sample_c_d, ValueType addr_type> { + def : SamplePattern <SIsample, sample, addr_type>; + def : SampleRectPattern <SIsample, sample, addr_type>; + def : SampleArrayPattern <SIsample, sample, addr_type>; + def : SampleShadowPattern <SIsample, sample_c, addr_type>; + def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>; + + def : SamplePattern <SIsamplel, sample_l, addr_type>; + def : SampleArrayPattern <SIsamplel, sample_l, addr_type>; + def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>; + def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>; + + def : SamplePattern <SIsampleb, sample_b, addr_type>; + def : SampleArrayPattern <SIsampleb, sample_b, addr_type>; + def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>; + def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>; + + def : SamplePattern <SIsampled, sample_d, addr_type>; + def : SampleArrayPattern <SIsampled, sample_d, addr_type>; + def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>; + def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>; +} + +defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2, + IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2, + IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2, + IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2, + v2i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4, + IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4, + IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4, + IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4, + v4i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8, + IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8, + IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8, + IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8, + v8i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, + IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16, + IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16, + IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, + v16i32>; + +/* int_SI_imageload for texture fetches consuming varying address parameters */ +class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), + 
(opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> { + def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>; + def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>; +} + +multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> { + def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>; + def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>; +} + +defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>; +defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>; + +defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>; +defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>; + +/* Image resource information */ +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +/********** ============================================ **********/ +/********** Extraction, Insertion, Building and Casting **********/ +/********** ============================================ **********/ + +//def : Extract_Element<i64, v2i64, 0, sub0_sub1>; +//def : Extract_Element<i64, v2i64, 1, sub2_sub3>; +//def : Extract_Element<f64, v2f64, 0, sub0_sub1>; +//def : Extract_Element<f64, v2f64, 1, sub2_sub3>; + +foreach Index = 0-2 in { + def Extract_Element_v2i32_#Index : Extract_Element < + i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v2i32_#Index : Insert_Element < + i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v2f32_#Index : Extract_Element < + f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v2f32_#Index : Insert_Element < + f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-3 in { + def Extract_Element_v4i32_#Index : Extract_Element < + i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v4i32_#Index : Insert_Element < + i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v4f32_#Index : Extract_Element < + f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v4f32_#Index : Insert_Element < + f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-7 in { + def Extract_Element_v8i32_#Index : Extract_Element < + i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v8i32_#Index : Insert_Element < + i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v8f32_#Index : Extract_Element < + f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v8f32_#Index : Insert_Element < + f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-15 in { + def Extract_Element_v16i32_#Index : Extract_Element < + i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v16i32_#Index : Insert_Element < + i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v16f32_#Index : 
Extract_Element < + f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v16f32_#Index : Insert_Element < + f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +def : BitConvert <i32, f32, SReg_32>; +def : BitConvert <i32, f32, VGPR_32>; + +def : BitConvert <f32, i32, SReg_32>; +def : BitConvert <f32, i32, VGPR_32>; + +def : BitConvert <i64, f64, VReg_64>; + +def : BitConvert <f64, i64, VReg_64>; + +def : BitConvert <v2f32, v2i32, VReg_64>; +def : BitConvert <v2i32, v2f32, VReg_64>; +def : BitConvert <v2i32, i64, VReg_64>; +def : BitConvert <i64, v2i32, VReg_64>; +def : BitConvert <v2f32, i64, VReg_64>; +def : BitConvert <i64, v2f32, VReg_64>; +def : BitConvert <v2f32, f64, VReg_64>; +def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <f64, v2f32, VReg_64>; +def : BitConvert <f64, v2i32, VReg_64>; +def : BitConvert <v4f32, v4i32, VReg_128>; +def : BitConvert <v4i32, v4f32, VReg_128>; + + +def : BitConvert <v2i64, v4i32, SReg_128>; +def : BitConvert <v4i32, v2i64, SReg_128>; + +def : BitConvert <v2f64, v4f32, VReg_128>; +def : BitConvert <v2f64, v4i32, VReg_128>; +def : BitConvert <v4f32, v2f64, VReg_128>; +def : BitConvert <v4i32, v2f64, VReg_128>; + + + + +def : BitConvert <v8f32, v8i32, SReg_256>; +def : BitConvert <v8i32, v8f32, SReg_256>; +def : BitConvert <v8i32, v32i8, SReg_256>; +def : BitConvert <v32i8, v8i32, SReg_256>; +def : BitConvert <v8i32, v32i8, VReg_256>; +def : BitConvert <v8i32, v8f32, VReg_256>; +def : BitConvert <v8f32, v8i32, VReg_256>; +def : BitConvert <v32i8, v8i32, VReg_256>; + +def : BitConvert <v16i32, v16f32, VReg_512>; +def : BitConvert <v16f32, v16i32, VReg_512>; + +/********** =================== **********/ +/********** Src & Dst modifiers **********/ +/********** =================== **********/ + +def : Pat < + (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), + (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) +>; + +/********** ================================ **********/ +/********** Floating point absolute/negative **********/ +/********** ================================ **********/ + +// Prevent expanding both fneg and fabs. + +def : Pat < + (fneg (fabs f32:$src)), + (S_OR_B32 $src, 0x80000000) /* Set sign bit */ +>; + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. + sub1) +>; + +def : Pat < + (fabs f32:$src), + (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) +>; + +def : Pat < + (fneg f32:$src), + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) +>; + +def : Pat < + (fabs f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. 
+ sub1) +>; + +def : Pat < + (fneg f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), + sub1) +>; + +/********** ================== **********/ +/********** Immediate Patterns **********/ +/********** ================== **********/ + +def : Pat < + (SGPRImm<(i32 imm)>:$imm), + (S_MOV_B32 imm:$imm) +>; + +def : Pat < + (SGPRImm<(f32 fpimm)>:$imm), + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i32 imm:$imm), + (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < + (f32 fpimm:$imm), + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i64 InlineImm<i64>:$imm), + (S_MOV_B64 InlineImm<i64>:$imm) +>; + +// XXX - Should this use a s_cmp to set SCC? + +// Set to sign-extended 64-bit value (true = -1, false = 0) +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 (i64 (as_i64imm $imm))) +>; + +def : Pat < + (f64 InlineFPImm<f64>:$imm), + (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm))) +>; + +/********** ================== **********/ +/********** Intrinsic Patterns **********/ +/********** ================== **********/ + +/* llvm.AMDGPU.pow */ +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; + +def : Pat < + (int_AMDGPU_div f32:$src0, f32:$src1), + (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) +>; + +def : Pat < + (int_AMDGPU_cube v4f32:$src), + (REG_SEQUENCE VReg_128, + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub0, + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub1, + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub2, + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub3) +>; + +def : Pat < + (i32 (sext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) +>; + +class Ext32Pat <SDNode ext> : Pat < + (i32 (ext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) +>; + +def : Ext32Pat <zext>; +def : Ext32Pat <anyext>; + +// Offset in an 32Bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) +>; + +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (V_CVT_U32_F32_e32 + (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) +>; + +def : Pat < + (int_SI_tid), + (V_MBCNT_HI_U32_B32_e64 0xffffffff, + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) +>; + +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat<V_MAD_I32_I24>; +def : UMad24Pat<V_MAD_U32_U24>; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1) +>; + +defm : BFIPatterns <V_BFI_B32, S_MOV_B32, 
SReg_64>; +def : ROTRPattern <V_ALIGNBIT_B32>; + +/********** ======================= **********/ +/********** Load/Store Patterns **********/ +/********** ======================= **********/ + +class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst $ptr, (as_i16imm $offset), (i1 0)) +>; + +def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; +def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; +def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; +def : DSReadPat <DS_READ_B32, i32, si_load_local>; + +let AddedComplexity = 100 in { + +def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>; + +} // End AddedComplexity = 100 + +def : Pat < + (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) +>; + +class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; +def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; +def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; + +let AddedComplexity = 100 in { + +def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; +} // End AddedComplexity = 100 + +def : Pat < + (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (i1 0)) +>; + +class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec +// +// We need to use something for the data0, so we set a register to +// -1. For the non-rtn variants, the manual says it does +// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max +// will always do the increment so I'm assuming it's the same. +class DSAtomicIncRetPat<DS inst, ValueType vt, + Instruction LoadImm, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), + (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) +>; + + +class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) +>; + + +// 32-bit atomics. 
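+//
+// A rough illustration of what the DS_INC/DS_DEC defs just below select (the
+// IR is only a sketch, not taken from a test): an LDS `atomicrmw add ..., 1`
+// becomes ds_inc_rtn_u32 with its data operand preloaded to 0xffffffff, so the
+// hardware's wrap check (DS[A] >= D0) only triggers at the wrap-around value
+// and the returned result matches an unconditional add of 1.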
+def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, + V_MOV_B32_e32, si_atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, + V_MOV_B32_e32, si_atomic_load_sub_local>; + +def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; + +// 64-bit atomics. +def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, + V_MOV_B64_PSEUDO, si_atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, + V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; + +def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; + + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, + PatFrag constant_ld> { + def : Pat < + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [isSICI] in { +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; +} // End Predicates = [isSICI] + +class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; +def : MUBUFScratchLoadPat 
<BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; + +// BUFFER_LOAD_DWORD*, addr64=0 +multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, + MUBUF bothen> { + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, + imm:$offset, 0, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 1, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 0, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, + imm:$offset, 1, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; +} + +defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; +defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, + BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; +defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, + BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; + +class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; + +/* +class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +let Predicates = [isSICI] in { +def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>; +} // End Predicates = [isSICI] + +*/ + +//===----------------------------------------------------------------------===// +// MTBUF Patterns +//===----------------------------------------------------------------------===// + +// TBUFFER_STORE_FORMAT_*, addr64=0 +class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat< + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + i32:$soffset, imm:$inst_offset, imm:$dfmt, + imm:$nfmt, imm:$offen, imm:$idxen, + imm:$glc, imm:$slc, imm:$tfe), + (opcode + $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), + (as_i1imm $glc), 0, (as_i8imm 
$dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, + (as_i1imm $slc), (as_i1imm $tfe), $soffset) +>; + +def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; +def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; +def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; +def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; + +/********** ====================== **********/ +/********** Indirect adressing **********/ +/********** ====================== **********/ + +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { + + // 1. Extract with offset + def : Pat< + (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) + >; + + // 2. Extract without offset + def : Pat< + (eltvt (extractelt vt:$vec, i32:$idx)), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) + >; + + // 3. Insert with offset + def : Pat< + (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) + >; + + // 4. Insert without offset + def : Pat< + (insertelt vt:$vec, eltvt:$val, i32:$idx), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) + >; +} + +defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; +defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; + +defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; +defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; + +//===----------------------------------------------------------------------===// +// Conversion Patterns +//===----------------------------------------------------------------------===// + +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + +// Handle sext_inreg in i64 +def : Pat < + (i64 (sext_inreg i64:$src, i1)), + (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i8)), + (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i16)), + (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i32)), + (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 +>; + +class ZExt_i64_i32_Pat <SDNode ext> : Pat < + (i64 (ext i32:$src)), + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) +>; + +class ZExt_i64_i1_Pat <SDNode ext> : Pat < + (i64 (ext i1:$src)), + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, + (S_MOV_B32 0), sub1) +>; + + +def : ZExt_i64_i32_Pat<zext>; +def : ZExt_i64_i32_Pat<anyext>; +def : ZExt_i64_i1_Pat<zext>; +def : ZExt_i64_i1_Pat<anyext>; + +def : Pat < + (i64 (sext i32:$src)), + (REG_SEQUENCE SReg_64, $src, sub0, + (S_ASHR_I32 $src, 31), sub1) +>; + +def : Pat < + (i64 (sext i1:$src)), + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 0, -1, $src), sub0, + (V_CNDMASK_B32_e64 0, -1, $src), sub1) +>; + +// If we need to perform a logical operation on i1 values, we need to +// use vector comparisons since there is only one SCC register. Vector +// comparisions still write to a pair of SGPRs, so treat these as +// 64-bit comparisons. When legalizing SGPR copies, instructions +// resulting in the copies from SCC to these instructions will be +// moved to the VALU. 
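+//
+// For instance (sketch only): `%c = and i1 %a, %b`, where %a and %b are lane
+// masks produced by vector compares, is selected by the first pattern below to
+// an s_and_b64 of the two 64-bit SGPR pairs instead of a single SCC-based
+// scalar operation.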
+def : Pat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) +>; + +def : Pat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +>; + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +def : Pat < + (f32 (sint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) +>; + +def : Pat < + (f32 (uint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) +>; + +def : Pat < + (f64 (sint_to_fp i1:$src)), + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) +>; + +def : Pat < + (f64 (uint_to_fp i1:$src)), + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (trunc i64:$a)), + (EXTRACT_SUBREG $a, sub0) +>; + +def : Pat < + (i1 (trunc i32:$a)), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) +>; + +def : Pat < + (i1 (trunc i64:$a)), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), + (EXTRACT_SUBREG $a, sub0)), 1) +>; + +def : Pat < + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) +>; + +def : Pat < + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) +>; + +multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { + def : Pat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : Pat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV 0)) + >; +} + +defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; +// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; + +def : BFEPattern <V_BFE_U32, S_MOV_B32>; + +//===----------------------------------------------------------------------===// +// Fract Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI] in { + +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient +// way to implement it is using V_FRACT_F64. +// The workaround for the V_FRACT bug is: +// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) + +// Convert (x + (-floor(x)) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_CNDMASK_B64_PSEUDO + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + $x, + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) +>; + +// Convert floor(x) to (x - fract(x)) +def : Pat < + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), + (V_ADD_F64 + $mods, + $x, + SRCMODS.NEG, + (V_CNDMASK_B64_PSEUDO + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + $x, + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isSI] + +//============================================================================// +// Miscellaneous Optimization Patterns +//============================================================================// + +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; + +//============================================================================// +// Assembler aliases +//============================================================================// + +def : MnemonicAlias<"v_add_u32", "v_add_i32">; +def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; +def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; + +} // End isGCN predicate diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td new file mode 100644 index 0000000..027a0a2 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -0,0 +1,199 @@ +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// SI Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + + +let TargetPrefix = "SI", isTarget = 1 in { + + def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; + def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; + + // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed + def int_SI_tbuffer_store : Intrinsic < + [], + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW + llvm_i32_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + []>; + + // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed + def int_SI_buffer_load_dword : Intrinsic < + [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + [IntrReadArgMem]>; + + def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Fully-flexible SAMPLE instruction. + class SampleRaw : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_v4i32_ty, // sampler(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Image instruction without a sampler. 
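+  //
+  // Hypothetical IR-level use of the image-load form defined below (operand
+  // order as listed; the overload suffix in the name is a guess):
+  //   %t = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> %vaddr,
+  //          <8 x i32> %rsrc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0,
+  //          i32 0, i32 0)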
+ class Image : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Basic sample + def int_SI_image_sample : SampleRaw; + def int_SI_image_sample_cl : SampleRaw; + def int_SI_image_sample_d : SampleRaw; + def int_SI_image_sample_d_cl : SampleRaw; + def int_SI_image_sample_l : SampleRaw; + def int_SI_image_sample_b : SampleRaw; + def int_SI_image_sample_b_cl : SampleRaw; + def int_SI_image_sample_lz : SampleRaw; + def int_SI_image_sample_cd : SampleRaw; + def int_SI_image_sample_cd_cl : SampleRaw; + + // Sample with comparison + def int_SI_image_sample_c : SampleRaw; + def int_SI_image_sample_c_cl : SampleRaw; + def int_SI_image_sample_c_d : SampleRaw; + def int_SI_image_sample_c_d_cl : SampleRaw; + def int_SI_image_sample_c_l : SampleRaw; + def int_SI_image_sample_c_b : SampleRaw; + def int_SI_image_sample_c_b_cl : SampleRaw; + def int_SI_image_sample_c_lz : SampleRaw; + def int_SI_image_sample_c_cd : SampleRaw; + def int_SI_image_sample_c_cd_cl : SampleRaw; + + // Sample with offsets + def int_SI_image_sample_o : SampleRaw; + def int_SI_image_sample_cl_o : SampleRaw; + def int_SI_image_sample_d_o : SampleRaw; + def int_SI_image_sample_d_cl_o : SampleRaw; + def int_SI_image_sample_l_o : SampleRaw; + def int_SI_image_sample_b_o : SampleRaw; + def int_SI_image_sample_b_cl_o : SampleRaw; + def int_SI_image_sample_lz_o : SampleRaw; + def int_SI_image_sample_cd_o : SampleRaw; + def int_SI_image_sample_cd_cl_o : SampleRaw; + + // Sample with comparison and offsets + def int_SI_image_sample_c_o : SampleRaw; + def int_SI_image_sample_c_cl_o : SampleRaw; + def int_SI_image_sample_c_d_o : SampleRaw; + def int_SI_image_sample_c_d_cl_o : SampleRaw; + def int_SI_image_sample_c_l_o : SampleRaw; + def int_SI_image_sample_c_b_o : SampleRaw; + def int_SI_image_sample_c_b_cl_o : SampleRaw; + def int_SI_image_sample_c_lz_o : SampleRaw; + def int_SI_image_sample_c_cd_o : SampleRaw; + def int_SI_image_sample_c_cd_cl_o : SampleRaw; + + // Basic gather4 + def int_SI_gather4 : SampleRaw; + def int_SI_gather4_cl : SampleRaw; + def int_SI_gather4_l : SampleRaw; + def int_SI_gather4_b : SampleRaw; + def int_SI_gather4_b_cl : SampleRaw; + def int_SI_gather4_lz : SampleRaw; + + // Gather4 with comparison + def int_SI_gather4_c : SampleRaw; + def int_SI_gather4_c_cl : SampleRaw; + def int_SI_gather4_c_l : SampleRaw; + def int_SI_gather4_c_b : SampleRaw; + def int_SI_gather4_c_b_cl : SampleRaw; + def int_SI_gather4_c_lz : SampleRaw; + + // Gather4 with offsets + def int_SI_gather4_o : SampleRaw; + def int_SI_gather4_cl_o : SampleRaw; + def int_SI_gather4_l_o : SampleRaw; + def int_SI_gather4_b_o : SampleRaw; + def int_SI_gather4_b_cl_o : SampleRaw; + def int_SI_gather4_lz_o : SampleRaw; + + // Gather4 with comparison and offsets + def int_SI_gather4_c_o : SampleRaw; + def int_SI_gather4_c_cl_o : SampleRaw; + def int_SI_gather4_c_l_o : SampleRaw; + def int_SI_gather4_c_b_o : SampleRaw; + def int_SI_gather4_c_b_cl_o : SampleRaw; + def int_SI_gather4_c_lz_o : SampleRaw; + + def int_SI_getlod : SampleRaw; + + // Image instrinsics. + def int_SI_image_load : Image; + def int_SI_image_load_mip : Image; + def int_SI_getresinfo : Image; + + // Deprecated image and sample intrinsics. 
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
+  def int_SI_samplel : Sample;
+  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
new file mode 100644
index 0000000..1bdb1f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -0,0 +1,443 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with nearby immediate offsets.
+// This will fuse operations such as
+//  ds_read_b32 v0, v2 offset:16
+//  ds_read_b32 v1, v2 offset:32
+// ==>
+//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+//   each other, and then only merges adjacent pairs of instructions. It would
+//   be good to be more flexible with interleaved instructions, and possibly
+//   run before scheduling. It currently misses stores of constants because
+//   loading the constant into the data register is placed between the stores,
+//   although this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, recomputes live intervals, and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads has offsets that are too large to fit in the 8-bit offset
+//   fields, but are close enough to one another that the differences do fit,
+//   we can add to the base pointer and use the new, reduced offsets (sketched
+//   below).
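+//
+//   A sketch of that last idea (the numbers are only illustrative, not taken
+//   from the pass or a testcase): two b32 loads at byte offsets 4000 and 4004
+//   have element offsets 1000 and 1001, which neither fit in 8 bits nor are
+//   multiples of 64, so they cannot be paired today. Adding 4000 to the base
+//   pointer once would leave element offsets 0 and 1, which a single
+//   ds_read2_b32 can encode directly.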
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-load-store-opt" + +namespace { + +class SILoadStoreOptimizer : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + + static bool offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned EltSize); + + MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize); + + void updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx); + + MachineBasicBlock::iterator mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + + MachineBasicBlock::iterator mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + +public: + static char ID; + + SILoadStoreOptimizer() + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), + LIS(nullptr) {} + + SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { + initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + } + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Load / Store Optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<LiveIntervals>(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) + +char SILoadStoreOptimizer::ID = 0; + +char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; + +FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { + return new SILoadStoreOptimizer(TM); +} + +bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned Size) { + // XXX - Would the same offset be OK? Is there any reason this would happen or + // be useful? + if (Offset0 == Offset1) + return false; + + // This won't be valid if the offset isn't aligned. + if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + return false; + + unsigned EltOffset0 = Offset0 / Size; + unsigned EltOffset1 = Offset1 / Size; + + // Check if the new offsets fit in the reduced 8-bit range. + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + return true; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. 
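+  // Illustrative numbers (not from a testcase): with 4-byte elements, byte
+  // offsets 1024 and 2048 give element offsets 256 and 512. Both overflow the
+  // 8-bit fields, but both are multiples of 64, so the st64 forms can encode
+  // them as 256/64 = 4 and 512/64 = 8.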
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) + return false; + + return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize){ + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + ++MBBI; + + if (MBBI->getOpcode() != I->getOpcode()) + return E; + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) + return MBBI; + } + + return E; +} + +void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), + E = MRI->reg_end(); I != E; ) { + MachineOperand &O = *I; + ++I; + O.substVirtReg(DstReg, SubIdx, *TRI); + } +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be careful, since the addresses could be subregisters themselves in weird + // cases, like vectors of pointers. + const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + + const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst); + const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst); + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Read2Desc = TII->get(Opc); + + const TargetRegisterClass *SuperRC + = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + + DebugLoc DL = I->getDebugLoc(); + MachineInstrBuilder Read2 + = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + .addOperand(*AddrReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + + const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + + // Copy to the old destination registers. + MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc) + .addOperand(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc) + .addOperand(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + LIS->InsertMachineInstrInMaps(Read2); + + // repairLiveintervalsInRange() doesn't handle physical register, so we have + // to update the M0 range manually. + SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); + LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); + bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + + // The new write to the original destination register is now the copy. Steal + // the old SlotIndex. + LIS->ReplaceMachineInstrInMaps(I, Copy0); + LIS->ReplaceMachineInstrInMaps(Paired, Copy1); + + I->eraseFromParent(); + Paired->eraseFromParent(); + + LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); + LIS->shrinkToUses(&AddrRegLI); + + LIS->createAndComputeVirtRegInterval(DestReg); + + if (UpdateM0Range) { + SlotIndex Read2Index = LIS->getInstructionIndex(Read2); + M0Segment->end = Read2Index.getRegSlot(); + } + + DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + return Read2.getInstr(); +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be sure to use .addOperand(), and not .addReg() with these. We want to be + // sure we preserve the subregister index and any register flags set on them. + const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Data1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? 
AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Write2Desc = TII->get(Opc); + DebugLoc DL = I->getDebugLoc(); + + // repairLiveintervalsInRange() doesn't handle physical register, so we have + // to update the M0 range manually. + SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); + LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); + bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + + MachineInstrBuilder Write2 + = BuildMI(*MBB, I, DL, Write2Desc) + .addOperand(*Addr) // addr + .addOperand(*Data0) // data0 + .addOperand(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + // XXX - How do we express subregisters here? + unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + // This doesn't handle physical registers like M0 + LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); + + if (UpdateM0Range) { + SlotIndex Write2Index = LIS->getInstructionIndex(Write2); + M0Segment->end = Write2Index.getRegSlot(); + } + + DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + return Write2.getInstr(); +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I; + + // Don't combine if volatile. + if (MI.hasOrderedMemoryRef()) { + ++I; + continue; + } + + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { + unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeRead2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { + unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 
8 : 4;
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      if (Match != E) {
+        Modified = true;
+        I = mergeWrite2Pair(I, Match, Size);
+      } else {
+        ++I;
+      }
+
+      continue;
+    }
+
+    ++I;
+  }
+
+  return Modified;
+}
+
+bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  const TargetSubtargetInfo &STM = MF.getSubtarget();
+  TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+  TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
+  MRI = &MF.getRegInfo();
+
+  LIS = &getAnalysis<LiveIntervals>();
+
+  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+
+  assert(!MRI->isSSA());
+
+  bool Modified = false;
+
+  for (MachineBasicBlock &MBB : MF)
+    Modified |= optimizeBlock(MBB);
+
+  return Modified;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
new file mode 100644
index 0000000..126f624
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -0,0 +1,607 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
+///
+/// All control flow is handled using predicated instructions and
+/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
+/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writing to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU). Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one bit for
+/// each vector ALU) and then the Scalar ALU will AND the VCC register with
+/// EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// %SGPR0 = SI_IF %VCC
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// %SGPR0 = SI_ELSE %SGPR0
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// SI_END_CF %SGPR0
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an optional
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC  // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label1            // Use our branch optimization
+///                                   // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block +/// label1: +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" + +using namespace llvm; + +namespace { + +class SILowerControlFlowPass : public MachineFunctionPass { + +private: + static const unsigned SkipThreshold = 12; + + static char ID; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + + bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); + + void Skip(MachineInstr &From, MachineOperand &To); + void SkipIfDead(MachineInstr &MI); + + void If(MachineInstr &MI); + void Else(MachineInstr &MI); + void Break(MachineInstr &MI); + void IfBreak(MachineInstr &MI); + void ElseBreak(MachineInstr &MI); + void Loop(MachineInstr &MI); + void EndCf(MachineInstr &MI); + + void Kill(MachineInstr &MI); + void Branch(MachineInstr &MI); + + void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); + void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); + void IndirectSrc(MachineInstr &MI); + void IndirectDst(MachineInstr &MI); + +public: + SILowerControlFlowPass(TargetMachine &tm) : + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower control flow instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +char SILowerControlFlowPass::ID = 0; + +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { + return new SILowerControlFlowPass(tm); +} + +bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, + MachineBasicBlock *To) { + + unsigned NumInstr = 0; + + for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); + MBB = *MBB->succ_begin()) { + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + NumInstr < SkipThreshold && I != E; ++I) { + + if (I->isBundle() || !I->isBundled()) + if (++NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { + + if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) + return; + + DebugLoc DL = From.getDebugLoc(); + BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addOperand(To); +} + +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() != + ShaderType::PIXEL || + !shouldSkip(&MBB, &MBB.getParent()->back())) + return; + + MachineBasicBlock::iterator Insert = &MI; + ++Insert; + + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3); + + // Exec mask is zero: Export to NULL target... 
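+  // Roughly the sequence emitted below (a sketch; the assembly syntax and
+  // operand order are approximate):
+  //   s_cbranch_execnz <past the next two instructions>
+  //   exp null off, off, off, off done vm
+  //   s_endpgm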
+ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0); + + // ... and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); +} + +void SILowerControlFlowPass::If(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::EndCf(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Branch(MachineInstr &MI) { + if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) + MI.eraseFromParent(); + + // If these aren't equal, this is probably an infinite loop. 
+} + +void SILowerControlFlowPass::Kill(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + // Kill is only allowed in pixel / geometry shaders. + assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif + + // Clear this thread from the exec mask if the operand is negative + if ((Op.isImm())) { + // Constant operand: Set exec mask to 0 or do nothing + if (Op.getImm() & 0x80000000) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + } + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) + .addImm(0) + .addOperand(Op); + } + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I = MI; + + unsigned Save = MI.getOperand(1).getReg(); + unsigned Idx = MI.getOperand(3).getReg(); + + if (AMDGPU::SReg_32RegClass.contains(Idx)) { + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx) + .addImm(Offset); + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + } + MBB.insert(I, MovRel); + } else { + + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); + + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); + + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(Idx); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } + // Do the actual move + MBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7); + + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + + } + MI.eraseFromParent(); +} + +/// \param @VecReg The register which holds element zero of the vector +/// being addressed into. +/// \param[out] @Reg The base register to use in the indirect addressing instruction. +/// \param[in,out] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] +// As an output, this is a constant value that needs +// to be added to the value stored in M0. 
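+//
+// A rough example of the computation below: addressing into a 4-element
+// vector held in VGPR4..VGPR7 with a constant Offset of 2 yields Reg = VGPR6
+// and Offset = 0 (the constant part is folded into the base register). If the
+// constant would index below element zero (e.g. Offset = -1 on a vector
+// starting at VGPR0), Reg stays at element zero and the negative remainder is
+// left in Offset, to be added to M0 at run time.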
+void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, + unsigned &Reg, + int &Offset) { + unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); + if (!SubReg) + SubReg = VecReg; + + const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); + int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + + if (RegIdx < 0) { + Offset = RegIdx; + RegIdx = 0; + } else { + Offset = 0; + } + + Reg = RC->getRegister(RegIdx); +} + +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vec = MI.getOperand(2).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Reg; + + computeIndirectRegAndOffset(Vec, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(Reg) + .addReg(Vec, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Val = MI.getOperand(5).getReg(); + unsigned Reg; + + computeIndirectRegAndOffset(Dst, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) + .addReg(Reg, RegState::Define) + .addReg(Val) + .addReg(Dst, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + bool HaveKill = false; + bool NeedWQM = false; + bool NeedFlat = false; + unsigned Depth = 0; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + + MachineInstr &MI = *I; + if (TII->isWQM(MI) || TII->isDS(MI)) + NeedWQM = true; + + // Flat uses m0 in case it needs to access LDS. 
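+      // Seeing a flat access here also triggers, at the end of
+      // runOnMachineFunction, a small prologue in the entry block (for
+      // kernels) that loads FLAT_SCR_LO / FLAT_SCR_HI with the scratch
+      // offset and the per-thread scratch size; see the NeedFlat handling
+      // below.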
+ if (TII->isFLAT(MI)) + NeedFlat = true; + + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_IF: + ++Depth; + If(MI); + break; + + case AMDGPU::SI_ELSE: + Else(MI); + break; + + case AMDGPU::SI_BREAK: + Break(MI); + break; + + case AMDGPU::SI_IF_BREAK: + IfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + ElseBreak(MI); + break; + + case AMDGPU::SI_LOOP: + ++Depth; + Loop(MI); + break; + + case AMDGPU::SI_END_CF: + if (--Depth == 0 && HaveKill) { + SkipIfDead(MI); + HaveKill = false; + } + EndCf(MI); + break; + + case AMDGPU::SI_KILL: + if (Depth == 0) + SkipIfDead(MI); + else + HaveKill = true; + Kill(MI); + break; + + case AMDGPU::S_BRANCH: + Branch(MI); + break; + + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: + IndirectSrc(MI); + break; + + case AMDGPU::SI_INDIRECT_DST_V1: + case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V16: + IndirectDst(MI); + break; + } + } + } + + if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { + MachineBasicBlock &MBB = MF.front(); + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC).addReg(AMDGPU::EXEC); + } + + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + + return true; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp new file mode 100644 index 0000000..a2fa5fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -0,0 +1,152 @@ +//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// i1 values are usually inserted by the CFG Structurize pass and they are +/// unique in that they can be copied from VALU to SALU registers. +/// This is not possible for any other value type. Since there are no +/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. 
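+/// A sketch of the two directions handled below: a copy from an SGPR pair
+/// into a VReg_1 register is rewritten as
+///   %vdst = V_CNDMASK_B32_e64 0, -1, %ssrc   (-1 in every lane whose bit is
+///                                             set in the 64-bit condition)
+/// and a copy from a VReg_1 register back to an SGPR pair is rewritten as
+///   %sdst = V_CMP_NE_I32_e64 %vsrc, 0        (per-lane compare materializes
+///                                             the mask in the SGPR pair)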
+/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::vector<unsigned> I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + if (RC == &AMDGPU::VReg_1RegClass) + MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY) + continue; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); + + if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(Dst.getReg()); + DebugLoc DL = MI.getDebugLoc(); + + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); + if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { + if (DefInst->getOperand(1).isImm()) { + I1Defs.push_back(Dst.getReg()); + + int64_t Val = DefInst->getOperand(1).getImm(); + assert(Val == 0 || Val == -1); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) + .addOperand(Dst) + .addImm(Val); + MI.eraseFromParent(); + continue; + } + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(Dst) + 
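+              // Per lane: Dst gets -1 (src1 below) where the condition bit in
+              // the SGPR pair Src is set, and 0 (src0 below) where it is clear.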
.addImm(0) + .addImm(-1) + .addOperand(Src); + MI.eraseFromParent(); + } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && + SrcRC == &AMDGPU::VReg_1RegClass) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) + .addOperand(Dst) + .addOperand(Src) + .addImm(0); + MI.eraseFromParent(); + } + } + } + + for (unsigned Reg : I1Defs) + MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp new file mode 100644 index 0000000..49677fc --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -0,0 +1,195 @@ +//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + + +#include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +#define MAX_LANES 64 + +using namespace llvm; + + +// Pin the vtable to this file. +void SIMachineFunctionInfo::anchor() {} + +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF), + TIDReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + PSInputAddr(0), + ReturnsVoid(true), + LDSWaveSpillSize(0), + PSInputEna(0), + NumUserSGPRs(0), + NumSystemSGPRs(0), + HasSpilledSGPRs(false), + HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), + DispatchPtr(false), + QueuePtr(false), + DispatchID(false), + KernargSegmentPtr(false), + FlatScratchInit(false), + GridWorkgroupCountX(false), + GridWorkgroupCountY(false), + GridWorkgroupCountZ(false), + WorkGroupIDX(true), + WorkGroupIDY(false), + WorkGroupIDZ(false), + WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), + WorkItemIDX(true), + WorkItemIDY(false), + WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const Function *F = MF.getFunction(); + + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if 
(F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; +} + +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( + MachineFunction *MF, + unsigned FrameIndex, + unsigned SubIdx) { + const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); + MachineRegisterInfo &MRI = MF->getRegInfo(); + int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); + Offset += SubIdx * 4; + + unsigned LaneVGPRIdx = Offset / (64 * 4); + unsigned Lane = (Offset / 4) % 64; + + struct SpilledReg Spill; + + if (!LaneVGPRs.count(LaneVGPRIdx)) { + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + + if (LaneVGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + + // When compiling from inside Mesa, the compilation continues. + // Select an arbitrary register to avoid triggering assertions + // during subsequent passes. + LaneVGPR = AMDGPU::VGPR0; + } + + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. + for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); + BI != BE; ++BI) { + BI->addLiveIn(LaneVGPR); + } + } + + Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; + Spill.Lane = Lane; + return Spill; +} + +unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + // FIXME: We should get this information from kernel attributes if it + // is available. + return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h new file mode 100644 index 0000000..846ee5d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -0,0 +1,306 @@ +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "SIRegisterInfo.h" +#include <map> + +namespace llvm { + +class MachineRegisterInfo; + +/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which +/// tells the hardware which interpolation parameters to load. +class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; + void anchor() override; + + unsigned TIDReg; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. + unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + + // Graphics info. + unsigned PSInputAddr; + bool ReturnsVoid; + +public: + // FIXME: Make private + unsigned LDSWaveSpillSize; + unsigned PSInputEna; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned ScratchOffsetReg; + unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; + +private: + bool HasSpilledSGPRs; + bool HasSpilledVGPRs; + + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; + bool DispatchPtr : 1; + bool QueuePtr : 1; + bool DispatchID : 1; + bool KernargSegmentPtr : 1; + bool FlatScratchInit : 1; + bool GridWorkgroupCountX : 1; + bool GridWorkgroupCountY : 1; + bool GridWorkgroupCountZ : 1; + + // Feature bits required for inputs passed in system SGPRs. + bool WorkGroupIDX : 1; // Always initialized. + bool WorkGroupIDY : 1; + bool WorkGroupIDZ : 1; + bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; + + bool WorkItemIDX : 1; // Always initialized. 
+ bool WorkItemIDY : 1; + bool WorkItemIDZ : 1; + + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + +public: + struct SpilledReg { + unsigned VGPR; + int Lane; + SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } + SpilledReg() : VGPR(0), Lane(-1) { } + bool hasLane() { return Lane != -1;} + }; + + // SIMachineFunctionInfo definition + + SIMachineFunctionInfo(const MachineFunction &MF); + SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, + unsigned SubIdx); + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; + unsigned getTIDReg() const { return TIDReg; }; + void setTIDReg(unsigned Reg) { TIDReg = Reg; } + + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. + unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + + bool hasDispatchPtr() const { + return DispatchPtr; + } + + bool hasQueuePtr() const { + return QueuePtr; + } + + bool hasDispatchID() const { + return DispatchID; + } + + bool hasKernargSegmentPtr() const { + return KernargSegmentPtr; + } + + bool hasFlatScratchInit() const { + return FlatScratchInit; + } + + bool hasGridWorkgroupCountX() const { + return GridWorkgroupCountX; + } + + bool hasGridWorkgroupCountY() const { + return GridWorkgroupCountY; + } + + bool hasGridWorkgroupCountZ() const { + return GridWorkgroupCountZ; + } + + bool hasWorkGroupIDX() const { + return WorkGroupIDX; + } + + bool hasWorkGroupIDY() const { + return WorkGroupIDY; + } + + bool hasWorkGroupIDZ() const { + return WorkGroupIDZ; + } + + bool hasWorkGroupInfo() const { + return WorkGroupInfo; + } + + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + + bool hasWorkItemIDX() const { + return WorkItemIDX; + } + + bool hasWorkItemIDY() const { + return WorkItemIDY; + } + + bool hasWorkItemIDZ() const { + return WorkItemIDZ; + } + + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + /// \brief Returns the physical register reserved for use as the resource + /// descriptor for scratch accesses. 
+ unsigned getScratchRSrcReg() const { + return ScratchRSrcReg; + } + + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } + + bool hasSpilledSGPRs() const { + return HasSpilledSGPRs; + } + + void setHasSpilledSGPRs(bool Spill = true) { + HasSpilledSGPRs = Spill; + } + + bool hasSpilledVGPRs() const { + return HasSpilledVGPRs; + } + + void setHasSpilledVGPRs(bool Spill = true) { + HasSpilledVGPRs = Spill; + } + + unsigned getPSInputAddr() const { + return PSInputAddr; + } + + bool isPSInputAllocated(unsigned Index) const { + return PSInputAddr & (1 << Index); + } + + void markPSInputAllocated(unsigned Index) { + PSInputAddr |= 1 << Index; + } + + bool returnsVoid() const { + return ReturnsVoid; + } + + void setIfReturnsVoid(bool Value) { + ReturnsVoid = Value; + } + + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; +}; + +} // End namespace llvm + + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp new file mode 100644 index 0000000..1cfa984 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -0,0 +1,1968 @@ +//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "SIMachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +// This scheduler implements a different scheduling algorithm than +// GenericScheduler. +// +// There are several specific architecture behaviours that can't be modelled +// for GenericScheduler: +// . When accessing the result of an SGPR load instruction, you have to wait +// for all the SGPR load instructions before your current instruction to +// have finished. +// . When accessing the result of an VGPR load instruction, you have to wait +// for all the VGPR load instructions previous to the VGPR load instruction +// you are interested in to finish. +// . The less the register pressure, the best load latencies are hidden +// +// Moreover some specifities (like the fact a lot of instructions in the shader +// have few dependencies) makes the generic scheduler have some unpredictable +// behaviours. For example when register pressure becomes high, it can either +// manage to prevent register pressure from going too high, or it can +// increase register pressure even more than if it hadn't taken register +// pressure into account. +// +// Also some other bad behaviours are generated, like loading at the beginning +// of the shader a constant in VGPR you won't need until the end of the shader. 
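+//
+// (In s_waitcnt terms: scalar memory results may return out of order, so
+// consuming any of them requires waiting with lgkmcnt(0) for all outstanding
+// scalar loads, whereas vector memory results return in order, so vmcnt(N)
+// only has to cover the loads issued before the one of interest.)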
+//
+// The scheduling problem for SI can be split into three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+//   performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts of different types, etc)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks helps keep register usage low.
+//
+// First the instructions are put into blocks.
+// We want the blocks to help control register usage and hide high latencies
+// later. To help control register usage, we typically want all local
+// computations, for example when a result is consumed right away, to be
+// contained in a block. Block inputs and outputs would typically be important
+// results that are needed in several locations of the shader. Since we do
+// want blocks to help hide high latencies, we want the instructions inside
+// the block to have a minimal set of dependencies on high latencies. That
+// makes it easy to pick blocks to hide specific high latencies.
+// The block creation algorithm is divided into several steps, and several
+// variants can be tried during the scheduling process.
+//
+// Second, the order of the instructions inside the blocks is chosen.
+// At that step we only take into account register usage and hiding
+// low latency instructions.
+//
+// Third, the block order is chosen; there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// When talking about 'low latency' or 'high latency' this includes both the
+// latency for the cache (or global mem) data to reach the register, and the
+// bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, which is why, even if the wavefront count is
+// high, we have to try to have as many instructions hiding high latencies
+// as possible.
+// The OpenCL doc gives, for example, a latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64-byte
+// lines.
+// Constant cache is shared with 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated with the 3 other
+// VALUs of the CU do texture sampling too. (Don't take these figures too
+// seriously, as I'm not 100% sure of the computation.)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared with 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
+// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as +// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same +// cache line. +// +// As of today the driver doesn't preload the constants in cache, thus the +// first loads get extra latency. The doc says global memory access can be +// 300-600 cycles. We do not specially take that into account when scheduling +// As we expect the driver to be able to preload the constants soon. + + +// common code // + +#ifndef NDEBUG + +static const char *getReasonStr(SIScheduleCandReason Reason) { + switch (Reason) { + case NoCand: return "NOCAND"; + case RegUsage: return "REGUSAGE"; + case Latency: return "LATENCY"; + case Successor: return "SUCCESSOR"; + case Depth: return "DEPTH"; + case NodeOrder: return "ORDER"; + } + llvm_unreachable("Unknown reason!"); +} + +#endif + +static bool tryLess(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal < CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal > CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +static bool tryGreater(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal > CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal < CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +// SIScheduleBlock // + +void SIScheduleBlock::addUnit(SUnit *SU) { + NodeNum2Index[SU->NodeNum] = SUnits.size(); + SUnits.push_back(SU); +} + +#ifndef NDEBUG + +void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { + + dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); + dbgs() << '\n'; +} +#endif + +void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, + SISchedCandidate &TryCand) { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + + if (Cand.SGPRUsage > 60 && + tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Schedule low latency instructions as top as possible. + // Order of priority is: + // . Low latency instructions which do not depend on other low latency + // instructions we haven't waited for + // . Other instructions which do not depend on low latency instructions + // we haven't waited for + // . Low latencies + // . All other instructions + // Goal is to get: low latency instructions - independant instructions + // - (eventually some more low latency instructions) + // - instructions that depend on the first low latency instructions. + // If in the block there is a lot of constant loads, the SGPR usage + // could go quite high, thus above the arbitrary limit of 60 will encourage + // use the already loaded constants (in order to release some SGPRs) before + // loading more. 
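+  // Each tryLess/tryGreater test below decides as soon as the two candidates
+  // differ on its criterion; ties fall through to the next test and finally
+  // to the original NodeNum order. E.g. two low latency candidates that tie
+  // on HasLowLatencyNonWaitedParent and IsLowLatency are ordered by
+  // LowLatencyOffset, then by VGPR usage, then by NodeNum.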
+ if (tryLess(TryCand.HasLowLatencyNonWaitedParent, + Cand.HasLowLatencyNonWaitedParent, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (TryCand.IsLowLatency && + tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Fall through to original instruction order. + if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { + TryCand.Reason = NodeOrder; + } +} + +SUnit* SIScheduleBlock::pickNode() { + SISchedCandidate TopCand; + + for (SUnit* SU : TopReadySUs) { + SISchedCandidate TryCand; + std::vector<unsigned> pressure; + std::vector<unsigned> MaxPressure; + // Predict register usage after this instruction. + TryCand.SU = SU; + TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); + TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; + TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; + TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; + TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; + TryCand.HasLowLatencyNonWaitedParent = + HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; + tryCandidateTopDown(TopCand, TryCand); + if (TryCand.Reason != NoCand) + TopCand.setBest(TryCand); + } + + return TopCand.SU; +} + + +// Schedule something valid. +void SIScheduleBlock::fastSchedule() { + TopReadySUs.clear(); + if (Scheduled) + undoSchedule(); + + for (SUnit* SU : SUnits) { + if (!SU->NumPredsLeft) + TopReadySUs.push_back(SU); + } + + while (!TopReadySUs.empty()) { + SUnit *SU = TopReadySUs[0]; + ScheduledSUnits.push_back(SU); + nodeScheduled(SU); + } + + Scheduled = true; +} + +// Returns if the register was set between first and last. +static bool isDefBetween(unsigned Reg, + SlotIndex First, SlotIndex Last, + const MachineRegisterInfo *MRI, + const LiveIntervals *LIS) { + for (MachineRegisterInfo::def_instr_iterator + UI = MRI->def_instr_begin(Reg), + UE = MRI->def_instr_end(); UI != UE; ++UI) { + const MachineInstr* MI = &*UI; + if (MI->isDebugValue()) + continue; + SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); + if (InstSlot >= First && InstSlot <= Last) + return true; + } + return false; +} + +void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock) { + IntervalPressure Pressure, BotPressure; + RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); + LiveIntervals *LIS = DAG->getLIS(); + MachineRegisterInfo *MRI = DAG->getMRI(); + DAG->initRPTracker(TopRPTracker); + DAG->initRPTracker(BotRPTracker); + DAG->initRPTracker(RPTracker); + + // Goes though all SU. RPTracker captures what had to be alive for the SUs + // to execute, and what is still alive at the end. + for (SUnit* SU : ScheduledSUnits) { + RPTracker.setPos(SU->getInstr()); + RPTracker.advance(); + } + + // Close the RPTracker to finalize live ins/outs. + RPTracker.closeRegion(); + + // Initialize the live ins and live outs. + TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); + BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); + + // Do not Track Physical Registers, because it messes up. 
+ for (unsigned Reg : RPTracker.getPressure().LiveInRegs) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + LiveInRegs.insert(Reg); + } + LiveOutRegs.clear(); + // There is several possibilities to distinguish: + // 1) Reg is not input to any instruction in the block, but is output of one + // 2) 1) + read in the block and not needed after it + // 3) 1) + read in the block but needed in another block + // 4) Reg is input of an instruction but another block will read it too + // 5) Reg is input of an instruction and then rewritten in the block. + // result is not read in the block (implies used in another block) + // 6) Reg is input of an instruction and then rewritten in the block. + // result is read in the block and not needed in another block + // 7) Reg is input of an instruction and then rewritten in the block. + // result is read in the block but also needed in another block + // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 + // We want LiveOutRegs to contain only Regs whose content will be read after + // in another block, and whose content was written in the current block, + // that is we want it to get 1, 3, 5, 7 + // Since we made the MIs of a block to be packed all together before + // scheduling, then the LiveIntervals were correct, and the RPTracker was + // able to correctly handle 5 vs 6, 2 vs 3. + // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) + // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 + // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 + // The use of findDefBetween removes the case 4. + for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) { + if (TargetRegisterInfo::isVirtualRegister(Reg) && + isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(), + LIS->getInstructionIndex(EndBlock).getRegSlot(), + MRI, LIS)) { + LiveOutRegs.insert(Reg); + } + } + + // Pressure = sum_alive_registers register size + // Internally llvm will represent some registers as big 128 bits registers + // for example, but they actually correspond to 4 actual 32 bits registers. + // Thus Pressure is not equal to num_alive_registers * constant. + LiveInPressure = TopPressure.MaxSetPressure; + LiveOutPressure = BotPressure.MaxSetPressure; + + // Prepares TopRPTracker for top down scheduling. + TopRPTracker.closeTop(); +} + +void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock) { + if (!Scheduled) + fastSchedule(); + + // PreScheduling phase to set LiveIn and LiveOut. + initRegPressure(BeginBlock, EndBlock); + undoSchedule(); + + // Schedule for real now. + + TopReadySUs.clear(); + + for (SUnit* SU : SUnits) { + if (!SU->NumPredsLeft) + TopReadySUs.push_back(SU); + } + + while (!TopReadySUs.empty()) { + SUnit *SU = pickNode(); + ScheduledSUnits.push_back(SU); + TopRPTracker.setPos(SU->getInstr()); + TopRPTracker.advance(); + nodeScheduled(SU); + } + + // TODO: compute InternalAdditionnalPressure. + InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); + + // Check everything is right. 
+#ifndef NDEBUG + assert(SUnits.size() == ScheduledSUnits.size() && + TopReadySUs.empty()); + for (SUnit* SU : SUnits) { + assert(SU->isScheduled && + SU->NumPredsLeft == 0); + } +#endif + + Scheduled = true; +} + +void SIScheduleBlock::undoSchedule() { + for (SUnit* SU : SUnits) { + SU->isScheduled = false; + for (SDep& Succ : SU->Succs) { + if (BC->isSUInBlock(Succ.getSUnit(), ID)) + undoReleaseSucc(SU, &Succ); + } + } + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + ScheduledSUnits.clear(); + Scheduled = false; +} + +void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + ++SuccSU->WeakPredsLeft; + return; + } + ++SuccSU->NumPredsLeft; +} + +void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + --SuccSU->WeakPredsLeft; + return; + } +#ifndef NDEBUG + if (SuccSU->NumPredsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + SuccSU->dump(DAG); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(nullptr); + } +#endif + + --SuccSU->NumPredsLeft; +} + +/// Release Successors of the SU that are in the block or not. +void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { + for (SDep& Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + + if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) + continue; + + releaseSucc(SU, &Succ); + if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) + TopReadySUs.push_back(SuccSU); + } +} + +void SIScheduleBlock::nodeScheduled(SUnit *SU) { + // Is in TopReadySUs + assert (!SU->NumPredsLeft); + std::vector<SUnit*>::iterator I = + std::find(TopReadySUs.begin(), TopReadySUs.end(), SU); + if (I == TopReadySUs.end()) { + dbgs() << "Data Structure Bug in SI Scheduler\n"; + llvm_unreachable(nullptr); + } + TopReadySUs.erase(I); + + releaseSuccessors(SU, true); + // Scheduling this node will trigger a wait, + // thus propagate to other instructions that they do not need to wait either. + if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + + if (DAG->IsLowLatencySU[SU->NodeNum]) { + for (SDep& Succ : SU->Succs) { + std::map<unsigned, unsigned>::iterator I = + NodeNum2Index.find(Succ.getSUnit()->NodeNum); + if (I != NodeNum2Index.end()) + HasLowLatencyNonWaitedParent[I->second] = 1; + } + } + SU->isScheduled = true; +} + +void SIScheduleBlock::finalizeUnits() { + // We remove links from outside blocks to enable scheduling inside the block. + for (SUnit* SU : SUnits) { + releaseSuccessors(SU, false); + if (DAG->IsHighLatencySU[SU->NodeNum]) + HighLatencyBlock = true; + } + HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); +} + +// we maintain ascending order of IDs +void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { + unsigned PredID = Pred->getID(); + + // Check if not already predecessor. + for (SIScheduleBlock* P : Preds) { + if (PredID == P->getID()) + return; + } + Preds.push_back(Pred); + +#ifndef NDEBUG + for (SIScheduleBlock* S : Succs) { + if (PredID == S->getID()) + assert(!"Loop in the Block Graph!\n"); + } +#endif +} + +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { + unsigned SuccID = Succ->getID(); + + // Check if not already predecessor. 
+ for (SIScheduleBlock* S : Succs) { + if (SuccID == S->getID()) + return; + } + if (Succ->isHighLatencyBlock()) + ++NumHighLatencySuccessors; + Succs.push_back(Succ); +#ifndef NDEBUG + for (SIScheduleBlock* P : Preds) { + if (SuccID == P->getID()) + assert("Loop in the Block Graph!\n"); + } +#endif +} + +#ifndef NDEBUG +void SIScheduleBlock::printDebug(bool full) { + dbgs() << "Block (" << ID << ")\n"; + if (!full) + return; + + dbgs() << "\nContains High Latency Instruction: " + << HighLatencyBlock << '\n'; + dbgs() << "\nDepends On:\n"; + for (SIScheduleBlock* P : Preds) { + P->printDebug(false); + } + + dbgs() << "\nSuccessors:\n"; + for (SIScheduleBlock* S : Succs) { + S->printDebug(false); + } + + if (Scheduled) { + dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' + << LiveInPressure[DAG->getVGPRSetID()] << '\n'; + dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' + << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; + dbgs() << "LiveIns:\n"; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + + dbgs() << "\nLiveOuts:\n"; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + } + + dbgs() << "\nInstructions:\n"; + if (!Scheduled) { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } else { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } + + dbgs() << "///////////////////////\n"; +} + +#endif + +// SIScheduleBlockCreator // + +SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : +DAG(DAG) { +} + +SIScheduleBlockCreator::~SIScheduleBlockCreator() { +} + +SIScheduleBlocks +SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { + std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = + Blocks.find(BlockVariant); + if (B == Blocks.end()) { + SIScheduleBlocks Res; + createBlocksForVariant(BlockVariant); + topologicalSort(); + scheduleInsideBlocks(); + fillStats(); + Res.Blocks = CurrentBlocks; + Res.TopDownIndex2Block = TopDownIndex2Block; + Res.TopDownBlock2Index = TopDownBlock2Index; + Blocks[BlockVariant] = Res; + return Res; + } else { + return B->second; + } +} + +bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { + if (SU->NodeNum >= DAG->SUnits.size()) + return false; + return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; +} + +void SIScheduleBlockCreator::colorHighLatenciesAlone() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + CurrentColoring[SU->NodeNum] = NextReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorHighLatenciesGroups() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned NumHighLatencies = 0; + unsigned GroupSize; + unsigned Color = NextReservedID; + unsigned Count = 0; + std::set<unsigned> FormingGroup; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) + ++NumHighLatencies; + } + + if (NumHighLatencies == 0) + return; + + if (NumHighLatencies <= 6) + GroupSize = 2; + else if (NumHighLatencies <= 12) + GroupSize = 3; + else + GroupSize = 4; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + unsigned CompatibleGroup = true; + unsigned ProposedColor = Color; + for (unsigned j : FormingGroup) { + // TODO: Currently CompatibleGroup will always be false, + // 
because the graph enforces the load order. This + // can be fixed, but as keeping the load order is often + // good for performance that causes a performance hit (both + // the default scheduler and this scheduler). + // When this scheduler determines a good load order, + // this can be fixed. + if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || + !DAG->canAddEdge(&DAG->SUnits[j], SU)) + CompatibleGroup = false; + } + if (!CompatibleGroup || ++Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + if (!CompatibleGroup) { + ProposedColor = Color; + FormingGroup.insert(SU->NodeNum); + } + Count = 0; + } else { + FormingGroup.insert(SU->NodeNum); + } + CurrentColoring[SU->NodeNum] = ProposedColor; + } + } +} + +void SIScheduleBlockCreator::colorComputeReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<std::set<unsigned>, unsigned> ColorCombinations; + + CurrentTopDownReservedDependencyColoring.clear(); + CurrentBottomUpReservedDependencyColoring.clear(); + + CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); + CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); + + // Traverse TopDown, and give different colors to SUs depending + // on which combination of High Latencies they depend on. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]]; + std::set<unsigned> SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) + SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); + } + // Color 0 by default. + if (SUColors.empty()) + continue; + // Same color than parents. + if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map<std::set<unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } + + ColorCombinations.clear(); + + // Same as before, but BottomUp. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); + } + // Keep color 0. + if (SUColors.empty()) + continue; + // Same color than parents. 
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map<std::set<unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } +} + +void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; + + // Every combination of colors given by the top down + // and bottom up Reserved node dependency + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + std::pair<unsigned, unsigned> SUColors; + + // High latency instructions: already given. + if (CurrentColoring[SU->NodeNum]) + continue; + + SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + + std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentColoring[SU->NodeNum] = Pos->second; + } else { + CurrentColoring[SU->NodeNum] = NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::vector<int> PendingColoring = CurrentColoring; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + std::set<unsigned> SUColorsPending; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentColoring[Succ->NodeNum]); + SUColorsPending.insert(PendingColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && SUColorsPending.size() == 1) + PendingColoring[SU->NodeNum] = *SUColors.begin(); + else // TODO: Attribute new colors depending on color + // combination of children. 
+ PendingColoring[SU->NodeNum] = NextNonReservedID++; + } + CurrentColoring = PendingColoring; +} + + +void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned PreviousColor; + std::set<unsigned> SeenColors; + + if (DAGSize <= 1) + return; + + PreviousColor = CurrentColoring[0]; + + for (unsigned i = 1, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned CurrentColor = CurrentColoring[i]; + unsigned PreviousColorSave = PreviousColor; + assert(i == SU->NodeNum); + + if (CurrentColor != PreviousColor) + SeenColors.insert(PreviousColor); + PreviousColor = CurrentColor; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (SeenColors.find(CurrentColor) == SeenColors.end()) + continue; + + if (PreviousColorSave != CurrentColor) + CurrentColoring[i] = NextNonReservedID++; + else + CurrentColoring[i] = CurrentColoring[i-1]; + } +} + +void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + // No predecessor: Vgpr constant loading. + // Low latency instructions usually have a predecessor (the address) + if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned, unsigned> ColorCount; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); + if (Pos != ColorCount.end()) { + ++ColorCount[color]; + } else { + ColorCount[color] = 1; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + 
unsigned color = CurrentColoring[SU->NodeNum]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (ColorCount[color] > 1) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() != color) { + --ColorCount[color]; + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + ++ColorCount[*SUColors.begin()]; + } + } +} + +void SIScheduleBlockCreator::cutHugeBlocks() { + // TODO +} + +void SIScheduleBlockCreator::regroupNoUserInstructions() { + unsigned DAGSize = DAG->SUnits.size(); + int GroupID = NextNonReservedID++; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + bool hasSuccessor = false; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + hasSuccessor = true; + } + if (!hasSuccessor) + CurrentColoring[SU->NodeNum] = GroupID; + } +} + +void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned,unsigned> RealID; + + CurrentBlocks.clear(); + CurrentColoring.clear(); + CurrentColoring.resize(DAGSize, 0); + Node2CurrentBlock.clear(); + + // Restore links previous scheduling variant has overridden. + DAG->restoreSULinksLeft(); + + NextReservedID = 1; + NextNonReservedID = DAGSize + 1; + + DEBUG(dbgs() << "Coloring the graph\n"); + + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) + colorHighLatenciesGroups(); + else + colorHighLatenciesAlone(); + colorComputeReservedDependencies(); + colorAccordingToReservedDependencies(); + colorEndsAccordingToDependencies(); + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) + colorForceConsecutiveOrderInGroup(); + regroupNoUserInstructions(); + colorMergeConstantLoadsNextGroup(); + colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Put SUs of same color into same block + Node2CurrentBlock.resize(DAGSize, -1); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned Color = CurrentColoring[SU->NodeNum]; + if (RealID.find(Color) == RealID.end()) { + int ID = CurrentBlocks.size(); + BlockPtrs.push_back( + make_unique<SIScheduleBlock>(DAG, this, ID)); + CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); + RealID[Color] = ID; + } + CurrentBlocks[RealID[Color]]->addUnit(SU); + Node2CurrentBlock[SU->NodeNum] = RealID[Color]; + } + + // Build dependencies between blocks. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + int SUID = Node2CurrentBlock[i]; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Succ->NodeNum] != SUID) + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + } + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Pred->NodeNum] != SUID) + CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); + } + } + + // Free root and leafs of all blocks to enable scheduling inside them. 
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->finalizeUnits(); + } + DEBUG( + dbgs() << "Blocks created:\n\n"; + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +// Two functions taken from Codegen/MachineScheduler.cpp + +/// If this iterator is a debug value, increment until reaching the End or a +/// non-debug instruction. +static MachineBasicBlock::const_iterator +nextIfDebug(MachineBasicBlock::const_iterator I, + MachineBasicBlock::const_iterator End) { + for(; I != End; ++I) { + if (!I->isDebugValue()) + break; + } + return I; +} + +/// Non-const version. +static MachineBasicBlock::iterator +nextIfDebug(MachineBasicBlock::iterator I, + MachineBasicBlock::const_iterator End) { + // Cast the return value to nonconst MachineInstr, then cast to an + // instr_iterator, which does not check for null, finally return a + // bundle_iterator. + return MachineBasicBlock::instr_iterator( + const_cast<MachineInstr*>( + &*nextIfDebug(MachineBasicBlock::const_iterator(I), End))); +} + +void SIScheduleBlockCreator::topologicalSort() { + unsigned DAGSize = CurrentBlocks.size(); + std::vector<int> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + + WorkList.reserve(DAGSize); + TopDownIndex2Block.resize(DAGSize); + TopDownBlock2Index.resize(DAGSize); + BottomUpIndex2Block.resize(DAGSize); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + unsigned Degree = Block->getSuccs().size(); + TopDownBlock2Index[i] = Degree; + if (Degree == 0) { + WorkList.push_back(i); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + int i = WorkList.back(); + SIScheduleBlock *Block = CurrentBlocks[i]; + WorkList.pop_back(); + TopDownBlock2Index[i] = --Id; + TopDownIndex2Block[Id] = i; + for (SIScheduleBlock* Pred : Block->getPreds()) { + if (!--TopDownBlock2Index[Pred->getID()]) + WorkList.push_back(Pred->getID()); + } + } + +#ifndef NDEBUG + // Check correctness of the ordering. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock* Pred : Block->getPreds()) { + assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && + "Wrong Top Down topological sorting"); + } + } +#endif + + BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(), + TopDownIndex2Block.rend()); +} + +void SIScheduleBlockCreator::scheduleInsideBlocks() { + unsigned DAGSize = CurrentBlocks.size(); + + DEBUG(dbgs() << "\nScheduling Blocks\n\n"); + + // We do schedule a valid scheduling such that a Block corresponds + // to a range of instructions. + DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->fastSchedule(); + } + + // Note: the following code, and the part restoring previous position + // is by far the most expensive operation of the Scheduler. + + // Do not update CurrentTop. 
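+  // Instead, advance a local copy of it (CurrentTopFastSched below) so that
+  // the DAG's notion of the unscheduled zone is left untouched while the
+  // instructions are temporarily reordered for liveness analysis; the
+  // original positions are restored at the end of this function.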
+  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+  std::vector<MachineBasicBlock::iterator> PosOld;
+  std::vector<MachineBasicBlock::iterator> PosNew;
+  PosOld.reserve(DAG->SUnits.size());
+  PosNew.reserve(DAG->SUnits.size());
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs) {
+      MachineInstr *MI = SU->getInstr();
+      MachineBasicBlock::iterator Pos = MI;
+      PosOld.push_back(Pos);
+      if (&*CurrentTopFastSched == MI) {
+        PosNew.push_back(Pos);
+        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+                                          DAG->getCurrentBottom());
+      } else {
+        // Update the instruction stream.
+        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+        // Update LiveIntervals.
+        // Note: Moving all instructions and calling handleMove every time
+        // is the most CPU-intensive operation of the scheduler.
+        // It would help a lot if there were a way to recompute the
+        // LiveIntervals for the entire scheduling region at once.
+        DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+        PosNew.push_back(CurrentTopFastSched);
+      }
+    }
+  }
+
+  // Now each Block of SUs corresponds to a contiguous block of MIs.
+  // We do the final schedule for the instructions inside the block.
+  // The property that all the SUs of a Block are grouped together as MIs
+  // is what makes the register usage tracking correct.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+  }
+
+  DEBUG(dbgs() << "Restoring MI Pos\n");
+  // Restore the old ordering (which prevents a LIS->handleMove bug).
+  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+    MachineBasicBlock::iterator POld = PosOld[i-1];
+    MachineBasicBlock::iterator PNew = PosNew[i-1];
+    if (PNew != POld) {
+      // Update the instruction stream.
+      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+      // Update LiveIntervals.
+ DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true); + } + } + + DEBUG( + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +void SIScheduleBlockCreator::fillStats() { + unsigned DAGSize = CurrentBlocks.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getPreds().size() == 0) + Block->Depth = 0; + else { + unsigned Depth = 0; + for (SIScheduleBlock *Pred : Block->getPreds()) { + if (Depth < Pred->Depth + 1) + Depth = Pred->Depth + 1; + } + Block->Depth = Depth; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = BottomUpIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getSuccs().size() == 0) + Block->Height = 0; + else { + unsigned Height = 0; + for (SIScheduleBlock *Succ : Block->getSuccs()) { + if (Height < Succ->Height + 1) + Height = Succ->Height + 1; + } + Block->Height = Height; + } + } +} + +// SIScheduleBlockScheduler // + +SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct) : + DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), + LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), + SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { + + // Fill the usage of every output + // Warning: while by construction we always have a link between two blocks + // when one needs a result from the other, the number of users of an output + // is not the sum of child blocks having as input the same virtual register. + // Here is an example. A produces x and y. B eats x and produces x'. + // C eats x' and y. The register coalescer may have attributed the same + // virtual register to x and x'. + // To count accurately, we do a topological sort. In case the register is + // found for several parents, we increment the usage of the one with the + // highest topological index. 
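+  // Illustrative sketch (hypothetical block graph, not taken from a real
+  // kernel): with blocks A -> B -> C and A -> C, the top-down indices could
+  // be A=0, B=1, C=2. If the coalescer merged x and x' into one virtual
+  // register, C's use of it is credited to B, the parent with the highest
+  // topological index that outputs the register, so only
+  // LiveOutRegsNumUsages[B][reg] is incremented for that use.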
+ LiveOutRegsNumUsages.resize(Blocks.size()); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + int topoInd = -1; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { + topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; + } + } + } + + if (!Found) + continue; + + int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; + std::map<unsigned, unsigned>::iterator RegPos = + LiveOutRegsNumUsages[PredID].find(Reg); + if (RegPos != LiveOutRegsNumUsages[PredID].end()) { + ++LiveOutRegsNumUsages[PredID][Reg]; + } else { + LiveOutRegsNumUsages[PredID][Reg] = 1; + } + } + } + + LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); + BlockNumPredsLeft.resize(Blocks.size()); + BlockNumSuccsLeft.resize(Blocks.size()); + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + BlockNumPredsLeft[i] = Block->getPreds().size(); + BlockNumSuccsLeft[i] = Block->getSuccs().size(); + } + +#ifndef NDEBUG + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + assert(Block->getID() == i); + } +#endif + + std::set<unsigned> InRegs = DAG->getInRegs(); + addLiveRegs(InRegs); + + // Fill LiveRegsConsumers for regs that were already + // defined before scheduling. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + break; + } + } + + if (!Found) { + if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (SIScheduleBlock* Block : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } + ); +} + +bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Try to hide high latencies. + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return true; + // Schedule high latencies early so you can hide them better. 
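+  // The tryLess/tryGreater helpers used in these comparisons return true as
+  // soon as a criterion is decisive in either direction, so the order of the
+  // checks below encodes the priority: high-latency blocks first, then
+  // height, then the number of high-latency successors.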
+  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+                 TryCand, Cand, Latency))
+    return true;
+  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+                                          TryCand, Cand, Depth))
+    return true;
+  if (tryGreater(TryCand.NumHighLatencySuccessors,
+                 Cand.NumHighLatencySuccessors,
+                 TryCand, Cand, Successor))
+    return true;
+  return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                                                    SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+              TryCand, Cand, RegUsage))
+    return true;
+  if (tryGreater(TryCand.NumSuccessors > 0,
+                 Cand.NumSuccessors > 0,
+                 TryCand, Cand, Successor))
+    return true;
+  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+    return true;
+  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+              TryCand, Cand, RegUsage))
+    return true;
+  return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.empty())
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock* Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (unsigned Reg : LiveRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact(TryCand.Block->getInRegs(),
+                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->getNumHighLatencySuccessors();
+    TryCand.LastPosHighLatParentScheduled =
+      (unsigned int) std::max<int> (0,
+         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+           LastPosWaitedHighLatency);
+    TryCand.Height = TryCand.Block->Height;
+    // Try not to increase VGPR usage too much, otherwise we may spill.
+    if (VregCurrentUsage > 120 ||
+        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+      if (!tryCandidateRegUsage(Cand, TryCand) &&
+          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+        tryCandidateLatency(Cand, TryCand);
+    } else {
+      if (!tryCandidateLatency(Cand, TryCand))
+        tryCandidateRegUsage(Cand, TryCand);
+    }
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+                   << getReasonStr(Cand.Reason) << '\n');
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+    dbgs() << "Is a block with high latency instruction: "
+           << (Cand.IsHighLatency ?
"yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n'; + ); + + Block = Cand.Block; + ReadyBlocks.erase(Best); + return Block; +} + +// Tracking of currently alive registers to determine VGPR Usage. + +void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + // If not already in the live set, then add it. + (void) LiveRegs.insert(Reg); + } +} + +void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, + std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); + assert (Pos != LiveRegs.end() && // Reg must be live. + LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && + LiveRegsConsumers[Reg] >= 1); + --LiveRegsConsumers[Reg]; + if (LiveRegsConsumers[Reg] == 0) + LiveRegs.erase(Pos); + } +} + +void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { + for (SIScheduleBlock* Block : Parent->getSuccs()) { + --BlockNumPredsLeft[Block->getID()]; + if (BlockNumPredsLeft[Block->getID()] == 0) { + ReadyBlocks.push_back(Block); + } + // TODO: Improve check. When the dependency between the high latency + // instructions and the instructions of the other blocks are WAR or WAW + // there will be no wait triggered. We would like these cases to not + // update LastPosHighLatencyParentScheduled. + if (Parent->isHighLatencyBlock()) + LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + } +} + +void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { + decreaseLiveRegs(Block, Block->getInRegs()); + addLiveRegs(Block->getOutRegs()); + releaseBlockSuccs(Block); + for (std::map<unsigned, unsigned>::iterator RegI = + LiveOutRegsNumUsages[Block->getID()].begin(), + E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { + std::pair<unsigned, unsigned> RegP = *RegI; + if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) + LiveRegsConsumers[RegP.first] = RegP.second; + else { + assert(LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; + } + } + if (LastPosHighLatencyParentScheduled[Block->getID()] > + (unsigned)LastPosWaitedHighLatency) + LastPosWaitedHighLatency = + LastPosHighLatencyParentScheduled[Block->getID()]; + ++NumBlockScheduled; +} + +std::vector<int> +SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs) { + std::vector<int> DiffSetPressure; + DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); + + for (unsigned Reg : InRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + if (LiveRegsConsumers[Reg] > 1) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] -= PSetI.getWeight(); + } + } + + for (unsigned Reg : OutRegs) { + // For now only track virtual registers. 
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] += PSetI.getWeight(); + } + } + + return DiffSetPressure; +} + +// SIScheduler // + +struct SIScheduleBlockResult +SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant) { + SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); + SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); + std::vector<SIScheduleBlock*> ScheduledBlocks; + struct SIScheduleBlockResult Res; + + ScheduledBlocks = Scheduler.getBlocks(); + + for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { + SIScheduleBlock *Block = ScheduledBlocks[b]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) + Res.SUs.push_back(SU->NodeNum); + } + + Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); + Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); + return Res; +} + +// SIScheduleDAGMI // + +SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : + ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) { + SITII = static_cast<const SIInstrInfo*>(TII); + SITRI = static_cast<const SIRegisterInfo*>(TRI); + + VGPRSetID = SITRI->getVGPR32PressureSet(); + SGPRSetID = SITRI->getSGPR32PressureSet(); +} + +SIScheduleDAGMI::~SIScheduleDAGMI() { +} + +ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleDAGMI::topologicalSort() { + std::vector<int> TopDownSU2Index; + unsigned DAGSize = SUnits.size(); + std::vector<SUnit*> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2SU.resize(DAGSize); + TopDownSU2Index.resize(DAGSize); + BottomUpIndex2SU.resize(DAGSize); + + WorkList.push_back(&getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownSU2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownSU2Index[SU->NodeNum] = --Id; + TopDownIndex2SU[Id] = SU->NodeNum; + } + for (SDep& Pred : SU->Preds) { + SUnit *SU = Pred.getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(), + TopDownIndex2SU.rend()); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Pred : SU->Preds) { + if (Pred.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] > + TopDownSU2Index[Pred.getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Succ : SU->Succs) { + if (Succ.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] < + TopDownSU2Index[Succ.getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +// Move low latencies further from their user without +// increasing SGPR usage (in general) +// 
This is to be replaced by a better pass that would +// take into account SGPR usage (based on VGPR Usage +// and the corresponding wavefront count), that would +// try to merge groups of loads if it make sense, etc +void SIScheduleDAGMI::moveLowLatencies() { + unsigned DAGSize = SUnits.size(); + int LastLowLatencyUser = -1; + int LastLowLatencyPos = -1; + + for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[ScheduledSUnits[i]]; + bool IsLowLatencyUser = false; + unsigned MinPos = 0; + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Pred->getInstr())) { + IsLowLatencyUser = true; + } + if (Pred->NodeNum >= DAGSize) + continue; + unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; + if (PredPos >= MinPos) + MinPos = PredPos + 1; + } + + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + unsigned BestPos = LastLowLatencyUser + 1; + if ((int)BestPos <= LastLowLatencyPos) + BestPos = LastLowLatencyPos + 1; + if (BestPos < MinPos) + BestPos = MinPos; + if (BestPos < i) { + for (unsigned u = i; u > BestPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[BestPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = BestPos; + } + LastLowLatencyPos = BestPos; + if (IsLowLatencyUser) + LastLowLatencyUser = BestPos; + } else if (IsLowLatencyUser) { + LastLowLatencyUser = i; + // Moves COPY instructions on which depends + // the low latency instructions too. + } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { + bool CopyForLowLat = false; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Succ->getInstr())) { + CopyForLowLat = true; + } + } + if (!CopyForLowLat) + continue; + if (MinPos < i) { + for (unsigned u = i; u > MinPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[MinPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = MinPos; + } + } + } +} + +void SIScheduleDAGMI::restoreSULinksLeft() { + for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + SUnits[i].isScheduled = false; + SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; + SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; + SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; + SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; + } +} + +// Return the Vgpr and Sgpr usage corresponding to some virtual registers. 
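+// For each virtual register in [First, End), the pressure sets the register
+// belongs to are walked and their weight is added to VgprUsage or SgprUsage
+// when the set is the VGPR_32 or SGPR_32 pressure set, respectively.
+// Typical use, as in SIScheduleBlockScheduler::pickBlock():
+//   unsigned VGPR, SGPR;
+//   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), VGPR, SGPR);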
+template<typename _Iterator> void +SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, + unsigned &VgprUsage, unsigned &SgprUsage) { + VgprUsage = 0; + SgprUsage = 0; + for (_Iterator RegI = First; RegI != End; ++RegI) { + unsigned Reg = *RegI; + // For now only track virtual registers + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = MRI.getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + if (*PSetI == VGPRSetID) + VgprUsage += PSetI.getWeight(); + else if (*PSetI == SGPRSetID) + SgprUsage += PSetI.getWeight(); + } + } +} + +void SIScheduleDAGMI::schedule() +{ + SmallVector<SUnit*, 8> TopRoots, BotRoots; + SIScheduleBlockResult Best, Temp; + DEBUG(dbgs() << "Preparing Scheduling\n"); + + buildDAGWithRegPressure(); + DEBUG( + for(SUnit& SU : SUnits) + SU.dumpAll(this) + ); + + Topo.InitDAGTopologicalSorting(); + topologicalSort(); + findRootsAndBiasEdges(TopRoots, BotRoots); + // We reuse several ScheduleDAGMI and ScheduleDAGMILive + // functions, but to make them happy we must initialize + // the default Scheduler implementation (even if we do not + // run it) + SchedImpl->initialize(this); + initQueues(TopRoots, BotRoots); + + // Fill some stats to help scheduling. + + SUnitsLinksBackup = SUnits; + IsLowLatencySU.clear(); + LowLatencyOffset.clear(); + IsHighLatencySU.clear(); + + IsLowLatencySU.resize(SUnits.size(), 0); + LowLatencyOffset.resize(SUnits.size(), 0); + IsHighLatencySU.resize(SUnits.size(), 0); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + unsigned BaseLatReg, OffLatReg; + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + IsLowLatencySU[i] = 1; + if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, + OffLatReg, TRI)) + LowLatencyOffset[i] = OffLatReg; + } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + IsHighLatencySU[i] = 1; + } + + SIScheduler Scheduler(this); + Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, + SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); +#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants + // which could lead to lower VGPR usage + if (Best.MaxVGPRUsage > 180) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + { LatenciesAlone, BlockRegUsageLatency }, +// { LatenciesAlone, BlockRegUsage }, + { LatenciesGrouped, BlockLatencyRegUsage }, +// { LatenciesGrouped, BlockRegUsageLatency }, +// { LatenciesGrouped, BlockRegUsage }, + { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, +// { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } + // if VGPR usage is still extremely high, we may spill. Try other variants + // which are less performing, but that could lead to lower VGPR usage. 
+ if (Best.MaxVGPRUsage > 200) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { +// { LatenciesAlone, BlockRegUsageLatency }, + { LatenciesAlone, BlockRegUsage }, +// { LatenciesGrouped, BlockLatencyRegUsage }, + { LatenciesGrouped, BlockRegUsageLatency }, + { LatenciesGrouped, BlockRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, + { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, + { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } +#endif + ScheduledSUnits = Best.SUs; + ScheduledSUnitsInv.resize(SUnits.size()); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + ScheduledSUnitsInv[ScheduledSUnits[i]] = i; + } + + moveLowLatencies(); + + // Tell the outside world about the result of the scheduling. + + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + + for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + + scheduleMI(SU, true); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); + } + + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + DEBUG({ + unsigned BBNum = begin()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h new file mode 100644 index 0000000..b270136 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -0,0 +1,489 @@ +//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H + +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +namespace llvm { + +enum SIScheduleCandReason { + NoCand, + RegUsage, + Latency, + Successor, + Depth, + NodeOrder +}; + +struct SISchedulerCandidate { + // The reason for this candidate. + SIScheduleCandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + SISchedulerCandidate() + : Reason(NoCand), RepeatReasonSet(0) {} + + bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } +}; + +class SIScheduleDAGMI; +class SIScheduleBlockCreator; + +class SIScheduleBlock { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator *BC; + + std::vector<SUnit*> SUnits; + std::map<unsigned, unsigned> NodeNum2Index; + std::vector<SUnit*> TopReadySUs; + std::vector<SUnit*> ScheduledSUnits; + + /// The top of the unscheduled zone. 
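+  // Pressure is tracked with TopRPTracker while the block is scheduled
+  // top-down; the tracker is expected to be set up through
+  // SIScheduleDAGMI::initRPTracker(), declared further down in this header.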
+ IntervalPressure TopPressure; + RegPressureTracker TopRPTracker; + + // Pressure: number of said class of registers needed to + // store the live virtual and real registers. + // We do care only of SGPR32 and VGPR32 and do track only virtual registers. + // Pressure of additional registers required inside the block. + std::vector<unsigned> InternalAdditionnalPressure; + // Pressure of input and output registers + std::vector<unsigned> LiveInPressure; + std::vector<unsigned> LiveOutPressure; + // Registers required by the block, and outputs. + // We do track only virtual registers. + // Note that some registers are not 32 bits, + // and thus the pressure is not equal + // to the number of live registers. + std::set<unsigned> LiveInRegs; + std::set<unsigned> LiveOutRegs; + + bool Scheduled; + bool HighLatencyBlock; + + std::vector<unsigned> HasLowLatencyNonWaitedParent; + + // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table. + unsigned ID; + + std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. + std::vector<SIScheduleBlock*> Succs; // All blocks successors. + unsigned NumHighLatencySuccessors; + +public: + SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, + unsigned ID): + DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(), + TopRPTracker(TopPressure), Scheduled(false), + HighLatencyBlock(false), ID(ID), + Preds(), Succs(), NumHighLatencySuccessors(0) {}; + + ~SIScheduleBlock() {}; + + unsigned getID() const { return ID; } + + /// Functions for Block construction. + void addUnit(SUnit *SU); + + // When all SUs have been added. + void finalizeUnits(); + + // Add block pred, which has instruction predecessor of SU. + void addPred(SIScheduleBlock *Pred); + void addSucc(SIScheduleBlock *Succ); + + const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } + const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; } + + unsigned Height; // Maximum topdown path length to block without outputs + unsigned Depth; // Maximum bottomup path length to block without inputs + + unsigned getNumHighLatencySuccessors() const { + return NumHighLatencySuccessors; + } + + bool isHighLatencyBlock() { return HighLatencyBlock; } + + // This is approximative. + // Ideally should take into accounts some instructions (rcp, etc) + // are 4 times slower. + int getCost() { return SUnits.size(); } + + // The block Predecessors and Successors must be all registered + // before fastSchedule(). + // Fast schedule with no particular requirement. + void fastSchedule(); + + std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; } + + // Complete schedule that will try to minimize reg pressure and + // low latencies, and will fill liveins and liveouts. + // Needs all MIs to be grouped between BeginBlock and EndBlock. + // The MIs can be moved after the scheduling, + // it is just used to allow correct track of live registers. + void schedule(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); + + bool isScheduled() { return Scheduled; } + + + // Needs the block to be scheduled inside + // TODO: find a way to compute it. + std::vector<unsigned> &getInternalAdditionnalRegUsage() { + return InternalAdditionnalPressure; + } + + std::set<unsigned> &getInRegs() { return LiveInRegs; } + std::set<unsigned> &getOutRegs() { return LiveOutRegs; } + + void printDebug(bool Full); + +private: + struct SISchedCandidate : SISchedulerCandidate { + // The best SUnit candidate. 
+ SUnit *SU; + + unsigned SGPRUsage; + unsigned VGPRUsage; + bool IsLowLatency; + unsigned LowLatencyOffset; + bool HasLowLatencyNonWaitedParent; + + SISchedCandidate() + : SU(nullptr) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SISchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + SGPRUsage = Best.SGPRUsage; + VGPRUsage = Best.VGPRUsage; + IsLowLatency = Best.IsLowLatency; + LowLatencyOffset = Best.LowLatencyOffset; + HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; + } + }; + + void undoSchedule(); + + void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSucc(SUnit *SU, SDep *SuccEdge); + // InOrOutBlock: restrict to links pointing inside the block (true), + // or restrict to links pointing outside the block (false). + void releaseSuccessors(SUnit *SU, bool InOrOutBlock); + + void nodeScheduled(SUnit *SU); + void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); + void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); + SUnit* pickNode(); + void traceCandidate(const SISchedCandidate &Cand); + void initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); +}; + +struct SIScheduleBlocks { + std::vector<SIScheduleBlock*> Blocks; + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; +}; + +enum SISchedulerBlockCreatorVariant { + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive +}; + +class SIScheduleBlockCreator { + SIScheduleDAGMI *DAG; + // unique_ptr handles freeing memory for us. + std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; + std::map<SISchedulerBlockCreatorVariant, + SIScheduleBlocks> Blocks; + std::vector<SIScheduleBlock*> CurrentBlocks; + std::vector<int> Node2CurrentBlock; + + // Topological sort + // Maps topological index to the node number. + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; + std::vector<int> BottomUpIndex2Block; + + // 0 -> Color not given. + // 1 to SUnits.size() -> Reserved group (you should only add elements to them). + // Above -> Other groups. + int NextReservedID; + int NextNonReservedID; + std::vector<int> CurrentColoring; + std::vector<int> CurrentTopDownReservedDependencyColoring; + std::vector<int> CurrentBottomUpReservedDependencyColoring; + +public: + SIScheduleBlockCreator(SIScheduleDAGMI *DAG); + ~SIScheduleBlockCreator(); + + SIScheduleBlocks + getBlocks(SISchedulerBlockCreatorVariant BlockVariant); + + bool isSUInBlock(SUnit *SU, unsigned ID); + +private: + // Give a Reserved color to every high latency. + void colorHighLatenciesAlone(); + + // Create groups of high latencies with a Reserved color. + void colorHighLatenciesGroups(); + + // Compute coloring for topdown and bottom traversals with + // different colors depending on dependencies on Reserved colors. + void colorComputeReservedDependencies(); + + // Give color to all non-colored SUs according to Reserved groups dependencies. + void colorAccordingToReservedDependencies(); + + // Divides Blocks having no bottom up or top down dependencies on Reserved groups. + // The new colors are computed according to the dependencies on the other blocks + // formed with colorAccordingToReservedDependencies. + void colorEndsAccordingToDependencies(); + + // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 
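+  // (When a non-Reserved color reappears after its run has been interrupted
+  // in SUnit order, the reappearing units receive a fresh color, so that in
+  // the end every group covers a consecutive range of SUnits.)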
+ void colorForceConsecutiveOrderInGroup(); + + // Merge Constant loads that have all their users into another group to the group. + // (TODO: else if all their users depend on the same group, put them there) + void colorMergeConstantLoadsNextGroup(); + + // Merge SUs that have all their users into another group to the group + void colorMergeIfPossibleNextGroup(); + + // Merge SUs that have all their users into another group to the group, + // but only for Reserved groups. + void colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Merge SUs that have all their users into another group to the group, + // but only if the group is no more than a few SUs. + void colorMergeIfPossibleSmallGroupsToNextGroup(); + + // Divides Blocks with important size. + // Idea of implementation: attribute new colors depending on topdown and + // bottom up links to other blocks. + void cutHugeBlocks(); + + // Put in one group all instructions with no users in this scheduling region + // (we'd want these groups be at the end). + void regroupNoUserInstructions(); + + void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); + + void topologicalSort(); + + void scheduleInsideBlocks(); + + void fillStats(); +}; + +enum SISchedulerBlockSchedulerVariant { + BlockLatencyRegUsage, + BlockRegUsageLatency, + BlockRegUsage +}; + +class SIScheduleBlockScheduler { + SIScheduleDAGMI *DAG; + SISchedulerBlockSchedulerVariant Variant; + std::vector<SIScheduleBlock*> Blocks; + + std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages; + std::set<unsigned> LiveRegs; + // Num of schedulable unscheduled blocks reading the register. + std::map<unsigned, unsigned> LiveRegsConsumers; + + std::vector<unsigned> LastPosHighLatencyParentScheduled; + int LastPosWaitedHighLatency; + + std::vector<SIScheduleBlock*> BlocksScheduled; + unsigned NumBlockScheduled; + std::vector<SIScheduleBlock*> ReadyBlocks; + + unsigned VregCurrentUsage; + unsigned SregCurrentUsage; + + // Currently is only approximation. + unsigned maxVregUsage; + unsigned maxSregUsage; + + std::vector<unsigned> BlockNumPredsLeft; + std::vector<unsigned> BlockNumSuccsLeft; + +public: + SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct); + ~SIScheduleBlockScheduler() {}; + + std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }; + + unsigned getVGPRUsage() { return maxVregUsage; }; + unsigned getSGPRUsage() { return maxSregUsage; }; + +private: + struct SIBlockSchedCandidate : SISchedulerCandidate { + // The best Block candidate. + SIScheduleBlock *Block; + + bool IsHighLatency; + int VGPRUsageDiff; + unsigned NumSuccessors; + unsigned NumHighLatencySuccessors; + unsigned LastPosHighLatParentScheduled; + unsigned Height; + + SIBlockSchedCandidate() + : Block(nullptr) {} + + bool isValid() const { return Block; } + + // Copy the status of another candidate without changing policy. 
+ void setBest(SIBlockSchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + Block = Best.Block; + Reason = Best.Reason; + IsHighLatency = Best.IsHighLatency; + VGPRUsageDiff = Best.VGPRUsageDiff; + NumSuccessors = Best.NumSuccessors; + NumHighLatencySuccessors = Best.NumHighLatencySuccessors; + LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; + Height = Best.Height; + } + }; + + bool tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + SIScheduleBlock *pickBlock(); + + void addLiveRegs(std::set<unsigned> &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); + void releaseBlockSuccs(SIScheduleBlock *Parent); + void blockScheduled(SIScheduleBlock *Block); + + // Check register pressure change + // by scheduling a block with these LiveIn and LiveOut. + std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs); + + void schedule(); +}; + +struct SIScheduleBlockResult { + std::vector<unsigned> SUs; + unsigned MaxSGPRUsage; + unsigned MaxVGPRUsage; +}; + +class SIScheduler { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator BlockCreator; + +public: + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + + ~SIScheduler() {}; + + struct SIScheduleBlockResult + scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant); +}; + +class SIScheduleDAGMI : public ScheduleDAGMILive { + const SIInstrInfo *SITII; + const SIRegisterInfo *SITRI; + + std::vector<SUnit> SUnitsLinksBackup; + + // For moveLowLatencies. After all Scheduling variants are tested. + std::vector<unsigned> ScheduledSUnits; + std::vector<unsigned> ScheduledSUnitsInv; + + unsigned VGPRSetID; + unsigned SGPRSetID; + +public: + SIScheduleDAGMI(MachineSchedContext *C); + + ~SIScheduleDAGMI() override; + + // Entry point for the schedule. + void schedule() override; + + // To init Block's RPTracker. + void initRPTracker(RegPressureTracker &RPTracker) { + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + } + + MachineBasicBlock *getBB() { return BB; } + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + LiveIntervals *getLIS() { return LIS; } + MachineRegisterInfo *getMRI() { return &MRI; } + const TargetRegisterInfo *getTRI() { return TRI; } + SUnit& getEntrySU() { return EntrySU; }; + SUnit& getExitSU() { return ExitSU; }; + + void restoreSULinksLeft(); + + template<typename _Iterator> void fillVgprSgprCost(_Iterator First, + _Iterator End, + unsigned &VgprUsage, + unsigned &SgprUsage); + std::set<unsigned> getInRegs() { + std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(), + RPTracker.getPressure().LiveInRegs.end()); + return InRegs; + }; + + unsigned getVGPRSetID() const { return VGPRSetID; } + unsigned getSGPRSetID() const { return SGPRSetID; } + +private: + void topologicalSort(); + // After scheduling is done, improve low latency placements. + void moveLowLatencies(); + +public: + // Some stats for scheduling inside blocks. + std::vector<unsigned> IsLowLatencySU; + std::vector<unsigned> LowLatencyOffset; + std::vector<unsigned> IsHighLatencySU; + // Topological sort + // Maps topological index to the node number. 
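+  // TopDownIndex2SU[i] is the NodeNum of the SUnit placed i-th in a top-down
+  // topological order of the DAG; BottomUpIndex2SU is the same order
+  // reversed. Both are filled by SIScheduleDAGMI::topologicalSort().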
+ std::vector<int> TopDownIndex2SU; + std::vector<int> BottomUpIndex2SU; +}; + +} // namespace llvm + +#endif /* SIMACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp new file mode 100644 index 0000000..609f5e7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -0,0 +1,691 @@ +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "SIRegisterInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { + unsigned NumRegPressureSets = getNumRegPressureSets(); + + SGPR32SetID = NumRegPressureSets; + VGPR32SetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) + SGPR32SetID = i; + else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) + VGPR32SetID = i; + } + assert(SGPR32SetID < NumRegPressureSets && + VGPR32SetID < NumRegPressureSets); +} + +void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} + +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + // Leave space for flat_scr, xnack_mask, vcc, and alignment + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and + // 100/101 for vcc. This is the next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr, xnack_mask, vcc, + // and scratch resource. + return AMDGPU::SGPR91; + } + + return AMDGPU::SGPR95; +} + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + + // EXEC_LO and EXEC_HI could be allocated and used as regular register, but + // this seems likely to result in bugs, so I'm marking them as reserved. 
+ reserveRegisterTuples(Reserved, AMDGPU::EXEC); + reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + + // Reserve the last 2 registers so we will always have at least 2 more that + // will physically contain VCC. + reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); + + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation + // for VCC/XNACK_MASK/FLAT_SCR. + // + // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose + // SGPRs when the XNACK feature is not used. This is currently not done + // because the code that counts SGPRs cannot account for such holes. + reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); + reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); + reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + } + + // Tonga and Iceland can only allocate a fixed number of SGPRs due + // to a hw bug. + if (ST.hasSGPRInitBug()) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; + + for (unsigned i = Limit; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + + return Reserved; +} + +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); + // FIXME: We should adjust the max number of waves based on LDS size. + unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), + STI.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + + unsigned VSLimit = SGPRLimit + VGPRLimit; + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + const TargetRegisterClass *RC = *I; + + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); + unsigned Limit; + + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. 
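+      // Such pseudo classes (classes whose registers may be allocated to
+      // either SGPRs or VGPRs, e.g. the VS_* operand classes) are given the
+      // combined SGPR+VGPR limit computed above.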
+ Limit = VSLimit; + } else if (isSGPRClass(RC)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(RC); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; ++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; +} + +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + unsigned Value, + unsigned ScratchRsrcReg, + unsigned ScratchOffset, + int64_t Offset, + RegScavenger *RS) const { + + MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MI->getParent()->getParent(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); + LLVMContext &Ctx = MF->getFunction()->getContext(); + DebugLoc DL = MI->getDebugLoc(); + bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + + bool RanOutOfSGPRs = false; + unsigned SOffset = ScratchOffset; + + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned Size = NumSubRegs * 4; + + if (!isUInt<12>(Offset + Size)) { + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + if (SOffset == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + SOffset = AMDGPU::SGPR0; + } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffset) + .addImm(Offset); + Offset = 0; + } + + if (RanOutOfSGPRs) + Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : + Value; + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + } +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + + switch (MI->getOpcode()) { + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg) + .addImm(Spill.Lane); + + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } + + // TODO: only do this when it is needed + switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states + // ("S_NOP 3") on SI + TII->insertWaitStates(MI, 4); + break; + case AMDGPUSubtarget::SEA_ISLANDS: + break; + default: // VOLCANIC_ISLANDS and later + // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states + // ("S_NOP 4") on VI and later. This also applies to VALUs which write + // VCC, but we're unlikely to see VMEM use VCC. 
+ TII->insertWaitStates(MI, 5); + } + + MI->eraseFromParent(); + break; + } + + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V96_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + } + + default: { + int64_t Offset = FrameInfo->getObjectOffset(Index); + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false, false, true); + } + } + } +} + +unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { + return getEncodingValue(Reg) & 0xff; +} + +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. +const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + static const TargetRegisterClass *const BaseClasses[] = { + &AMDGPU::VGPR_32RegClass, + &AMDGPU::SReg_32RegClass, + &AMDGPU::VReg_64RegClass, + &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, + &AMDGPU::SReg_128RegClass, + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass, + &AMDGPU::SReg_512RegClass + }; + + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; + } + } + return nullptr; +} + +// TODO: It might be helpful to have some target specific flags in +// TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
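+ +// For example, hasVGPRs(&AMDGPU::VReg_64RegClass) finds a common subclass +// with VReg_64 and returns true, while an SGPR class of the same size, such +// as SReg_64, shares no subclass with any VGPR class and returns false.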
+bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { + switch (RC->getSize()) { + case 4: + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; + case 8: + return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; + case 12: + return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; + case 16: + return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; + case 32: + return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const { + switch (SRC->getSize()) { + case 4: + return &AMDGPU::VGPR_32RegClass; + case 8: + return &AMDGPU::VReg_64RegClass; + case 12: + return &AMDGPU::VReg_96RegClass; + case 16: + return &AMDGPU::VReg_128RegClass; + case 32: + return &AMDGPU::VReg_256RegClass; + case 64: + return &AMDGPU::VReg_512RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + +const TargetRegisterClass *SIRegisterInfo::getSubRegClass( + const TargetRegisterClass *RC, unsigned SubIdx) const { + if (SubIdx == AMDGPU::NoSubRegister) + return RC; + + // We can assume that each lane corresponds to one 32-bit register. + unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx)); + if (isSGPRClass(RC)) { + switch (Count) { + case 1: + return &AMDGPU::SGPR_32RegClass; + case 2: + return &AMDGPU::SReg_64RegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } + } else { + switch (Count) { + case 1: + return &AMDGPU::VGPR_32RegClass; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + case 8: + return &AMDGPU::VReg_256RegClass; + case 16: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } + } +} + +bool SIRegisterInfo::shouldRewriteCopySrc( + const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const { + // We want to prefer the smallest register class possible, so we don't want to + // stop and rewrite on anything that looks like a subregister + // extract. Operations mostly don't care about the super register class, so we + // only want to stop on the most basic of copies between the smae register + // class. + // + // e.g. if we have something like + // vreg0 = ... + // vreg1 = ... + // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2 + // vreg3 = COPY vreg2, sub0 + // + // We want to look through the COPY to find: + // => vreg3 = COPY vreg0 + + // Plain copy. 
+ return getCommonSubClass(DefRC, SrcRC) != nullptr; +} + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + + switch (Reg) { + case AMDGPU::VCC: + switch(Channel) { + case 0: return AMDGPU::VCC_LO; + case 1: return AMDGPU::VCC_HI; + default: llvm_unreachable("Invalid SubIdx for VCC"); + } + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; + } + + const TargetRegisterClass *RC = getPhysRegClass(Reg); + // 32-bit registers don't have sub-registers, so we can just return the + // Reg. We need to have this check here, because the calculation below + // using getHWRegIndex() will fail with special 32-bit registers like + // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. + if (RC->getSize() == 4) { + assert(Channel == 0); + return Reg; + } + + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} + +bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { + return OpType == AMDGPU::OPERAND_REG_IMM32; +} + +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (opCanUseLiteralConstant(OpType)) + return true; + + return OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +// FIXME: Most of these are flexible with HSA and we don't need to reserve them +// as input registers if unused. Whether the dispatch ptr is necessary should be +// easy to detect from used intrinsics. Scratch setup is harder to know. +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + (void)ST; + switch (Value) { + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_PTR: + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); + return AMDGPU::VGPR0; + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); + return AMDGPU::VGPR1; + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected preloaded value type"); +} + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +// AMDGPU::NoRegister. 
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const { + for (unsigned Reg : *RC) + if (!MRI.isPhysRegUsed(Reg)) + return Reg; + return AMDGPU::NoRegister; +} + +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const { + if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WaveCount) { + case 10: return 80; + case 9: return 80; + case 8: return 96; + default: return 102; + } + } else { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h new file mode 100644 index 0000000..9410e20 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -0,0 +1,164 @@ +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIRegisterInfo +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +namespace llvm { + +struct SIRegisterInfo : public AMDGPURegisterInfo { +private: + unsigned SGPR32SetID; + unsigned VGPR32SetID; + + void reserveRegisterTuples(BitVector &, unsigned Reg) const; + +public: + SIRegisterInfo(); + + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. + unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; + + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief Return the 'base' register class for this register. + /// e.g. SGPR0 => SReg_32, VGPR0 => VGPR_32, SGPR0_SGPR1 => SReg_64, etc.
+ const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; + + /// \returns true if this class contains only SGPR registers + bool isSGPRClass(const TargetRegisterClass *RC) const { + return !hasVGPRs(RC); + } + + /// \returns true if this class ID contains only SGPR registers + bool isSGPRClassID(unsigned RCID) const { + return isSGPRClass(getRegClass(RCID)); + } + + bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return isSGPRClass(MRI.getRegClass(Reg)); + return getPhysRegClass(Reg); + } + + /// \returns true if this class contains VGPR registers. + bool hasVGPRs(const TargetRegisterClass *RC) const; + + /// returns true if this is a pseudoregister class combination of VGPRs and + /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on + /// them. + static bool isPseudoRegClass(const TargetRegisterClass *RC) { + return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass; + } + + /// \returns A VGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const; + + /// \returns The register class that is used for a sub-register of \p RC for + /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will + /// be returned. + const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, + unsigned SubIdx) const; + + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const override; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. + unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; + + /// \returns True if operands defined with this operand type can accept + /// a literal constant (i.e. any 32-bit immediate). + bool opCanUseLiteralConstant(unsigned OpType) const; + + /// \returns True if operands defined with this operand type can accept + /// an inline constant. i.e. An integer value in the range (-16, 64) or + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + bool opCanUseInlineConstant(unsigned OpType) const; + + enum PreloadedValue { + // SGPRS: + PRIVATE_SEGMENT_BUFFER = 0, + DISPATCH_PTR = 1, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + + // VGPRS: + FIRST_VGPR_VALUE = 15, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 + }; + + /// \brief Returns the physical register that \p Value is stored in. + unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + + /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. 
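+ /// For example, on VI ten concurrent waves limit a kernel to at most 80 + /// SGPRs (see the tables in SIRegisterInfo.cpp).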
+ unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const; + + unsigned findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const; + + unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; + unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + +private: + void buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, unsigned Value, + unsigned ScratchRsrcReg, unsigned ScratchOffset, + int64_t Offset, RegScavenger *RS) const; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td new file mode 100644 index 0000000..bfaf937 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -0,0 +1,327 @@ +//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the SI registers +//===----------------------------------------------------------------------===// +class SIReg <string n, bits<16> regIdx = 0> : Register<n>, + DwarfRegNum<[!cast<int>(HWEncoding)]> { + let Namespace = "AMDGPU"; + + // This is the not yet the complete register encoding. An additional + // bit is set for VGPRs. + let HWEncoding = regIdx; +} + +// Special Registers +def VCC_LO : SIReg<"vcc_lo", 106>; +def VCC_HI : SIReg<"vcc_hi", 107>; + +// VCC for 64-bit instructions +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, + DwarfRegAlias<VCC_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 106; +} + +def EXEC_LO : SIReg<"exec_lo", 126>; +def EXEC_HI : SIReg<"exec_hi", 127>; + +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, + DwarfRegAlias<EXEC_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 126; +} + +def SCC : SIReg<"scc", 253>; +def M0 : SIReg <"m0", 124>; + +multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { + def _ci : SIReg<n, ci_e>; + def _vi : SIReg<n, vi_e>; + def "" : SIReg<"", 0>; +} + +class FlatReg <Register lo, Register hi, bits<16> encoding> : + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + DwarfRegAlias<lo> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = encoding; +} + +defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes. +defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes. + +def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>; +def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; +def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; + +// SGPR registers +foreach Index = 0-103 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index>; +} + +// VGPR registers +foreach Index = 0-255 in { + def VGPR#Index : SIReg <"VGPR"#Index, Index> { + let HWEncoding{8} = 1; + } +} + +//===----------------------------------------------------------------------===// +// Groupings using register classes and tuples +//===----------------------------------------------------------------------===// + +// TODO: Do we need to set DwarfRegAlias on register tuples? 
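+ +// Note on how the tuple classes below are formed: (decimate SGPR_32, 2) +// selects every second register starting at SGPR0 and +// (decimate (shl SGPR_32, 1), 2) selects the odd ones, so SGPR_64Regs zips +// them into the even-aligned pairs SGPR0_SGPR1, SGPR2_SGPR3, and so on; the +// wider SGPR tuples use the same scheme with four-register alignment.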
+ +// SGPR 32-bit registers +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "SGPR%u", 0, 103))>; + +// SGPR 64-bit registers +def SGPR_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate SGPR_32, 2)), + (add (decimate (shl SGPR_32, 1), 2))]>; + +// SGPR 128-bit registers +def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4))]>; + +// SGPR 256-bit registers +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4))]>; + +// SGPR 512-bit registers +def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 13), 4)), + (add (decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4))]>; + +// VGPR 32-bit registers +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "VGPR%u", 0, 255))>; + +// VGPR 64-bit registers +def VGPR_64 : RegisterTuples<[sub0, sub1], + [(add (trunc VGPR_32, 255)), + (add (shl VGPR_32, 1))]>; + +// VGPR 96-bit registers +def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], + [(add (trunc VGPR_32, 254)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2))]>; + +// VGPR 128-bit registers +def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (trunc VGPR_32, 253)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3))]>; + +// VGPR 256-bit registers +def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (trunc VGPR_32, 249)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7))]>; + +// VGPR 512-bit registers +def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (trunc VGPR_32, 241)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + (add (shl VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15))]>; + +//===----------------------------------------------------------------------===// +// Register classes used as source and destination +//===----------------------------------------------------------------------===// + +class RegImmMatcher<string name> : AsmOperandClass { + let 
Name = name; + let RenderMethod = "addRegOrImmOperands"; +} + +// Register class for all scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) +>; + +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; + +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, + (add SGPR_64, VCC, EXEC, FLAT_SCR) +>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { + // Requires 2 s_mov_b64 to copy + let CopyCost = 2; +} + +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> { + // Requires 4 s_mov_b64 to copy + let CopyCost = 4; +} + +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { + // Requires 8 s_mov_b64 to copy + let CopyCost = 8; +} + +// Register class for all vector registers (VGPRs + Interpolation Registers) +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { + // Requires 2 v_mov_b32 to copy + let CopyCost = 2; +} + +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { + let Size = 96; + + // Requires 3 v_mov_b32 to copy + let CopyCost = 3; +} + +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { + // Requires 4 v_mov_b32 to copy + let CopyCost = 4; +} + +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> { + let CopyCost = 8; +} + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { + let CopyCost = 16; +} + +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 32; +} + +class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; +} + +class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; +} + +//===----------------------------------------------------------------------===// +// SSrc_* Operands with an SGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def SSrc_32 : RegImmOperand<SReg_32> { + let ParserMatchClass = RegImmMatcher<"SSrc32">; +} + +def SSrc_64 : RegImmOperand<SReg_64> { + let ParserMatchClass = RegImmMatcher<"SSrc64">; +} + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or an inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_32 : RegInlineOperand<SReg_32> { + let ParserMatchClass = RegImmMatcher<"SCSrc32">; +} + +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { + let CopyCost = 2; +} + +def VSrc_32 : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc32">; +} + +def VSrc_64 : RegisterOperand<VS_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc64">; +} +
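+// Note: the source operand classes in this file differ mainly in which +// immediates they accept; e.g. VOP1/VOP2 sources use the VSrc_* forms above +// (any 32-bit literal), while VOP3 sources use the VCSrc_* forms below +// (inline constants only, since the 64-bit encoding has no literal slot). +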
+//===----------------------------------------------------------------------===// +// VCSrc_* Operands with an SGPR, VGPR or an inline constant +//===----------------------------------------------------------------------===// + +def VCSrc_32 : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc32">; +} + +def VCSrc_64 : RegisterOperand<VS_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc64">; +} + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or an inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_64 : RegisterOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"SCSrc64">; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td new file mode 100644 index 0000000..cd77e51 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -0,0 +1,105 @@ +//===-- SISchedule.td - SI Scheduling definitons -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MachineModel definitions for Southern Islands (SI) +// +//===----------------------------------------------------------------------===// + +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; +def WriteBarrier : SchedWrite; + +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteQuarterRate32 : SchedWrite; +def WriteFullOrQuarterRate32 : SchedWrite; + +def WriteFloatFMA : SchedWrite; + +// Slow quarter rate f64 instruction. +def WriteDouble : SchedWrite; + +// half rate f64 instruction (same as v_add_f64) +def WriteDoubleAdd : SchedWrite; + +// Half rate 64-bit instructions. +def Write64Bit : SchedWrite; + +// FIXME: Should there be a class for instructions which are VALU +// instructions and have VALU rates, but write to the SALU (i.e. VOPC +// instructions) + +def SIFullSpeedModel : SchedMachineModel; +def SIQuarterSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? +def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, + int latency> : WriteRes<write, resources> { + let Latency = latency; +} + +class HWVALUWriteRes<SchedWrite write, int latency> : + HWWriteRes<write, [HWVALU], latency>; + + +// The latency numbers are taken from AMD Accelerated Parallel Processing +// guide. They may not be accurate. + +// The latency values are 1 / (operations / cycle) / 4. +multiclass SICommonWriteRes { + + def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ??? 
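+ // Worked example of the normalization above: a quarter-rate VALU op + // retires a 64-lane wave in roughly 16 cycles, and 16 / 4 gives the + // WriteQuarterRate32 latency of 4 below; half-rate 64-bit ops likewise + // give 8 / 4 = 2 for Write64Bit.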
+ def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64 + def : HWWriteRes<WriteSALU, [HWSALU], 1>; + def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? + def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ??? + + def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<Write64Bit, 2>; + def : HWVALUWriteRes<WriteQuarterRate32, 4>; +} + + +let SchedModel = SIFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 4>; +def : HWVALUWriteRes<WriteDoubleAdd, 2>; + +} // End SchedModel = SIFullSpeedModel + +let SchedModel = SIQuarterSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 16>; +def : HWVALUWriteRes<WriteDouble, 16>; +def : HWVALUWriteRes<WriteDoubleAdd, 8>; + +} // End SchedModel = SIQuarterSpeedModel diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp new file mode 100644 index 0000000..4f0913f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -0,0 +1,334 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUMCInstLower.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-shrink-instructions" + +STATISTIC(NumInstructionsShrunk, + "Number of 64-bit instruction reduced to 32-bit."); +STATISTIC(NumLiteralConstantsFolded, + "Number of literal constants folded into 32-bit instructions."); + +namespace llvm { + void initializeSIShrinkInstructionsPass(PassRegistry&); +} + +using namespace llvm; + +namespace { + +class SIShrinkInstructions : public MachineFunctionPass { +public: + static char ID; + +public: + SIShrinkInstructions() : MachineFunctionPass(ID) { + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Shrink Instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
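+ +// A typical shrink performed below: a VOP3-encoded v_add_f32_e64 v0, v1, v2 +// with no source or output modifiers becomes v_add_f32_e32 v0, v1, v2, +// halving the encoding from 8 bytes to 4; foldImmediates can then fold a +// literal that reaches src0 through a v_mov_b32 directly into the e32 form.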
+ +INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, + "SI Shrink Instructions", false, false) +INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, + "SI Shrink Instructions", false, false) + +char SIShrinkInstructions::ID = 0; + +FunctionPass *llvm::createSIShrinkInstructionsPass() { + return new SIShrinkInstructions(); +} + +static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + if (!MO->isReg()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) + return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); + + return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); +} + +static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + + const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + // Can't shrink instruction with three operands. + // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add + // a special case for it. It can only be shrunk if the third operand + // is vcc. We should handle this the same way we handle vopc, by adding + // a register allocation hint pre-regalloc and then doing the shrinking + // post-regalloc. + if (Src2) { + switch (MI.getOpcode()) { + default: return false; + + case AMDGPU::V_MAC_F32_e64: + if (!isVGPR(Src2, TRI, MRI) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) + return false; + break; + + case AMDGPU::V_CNDMASK_B32_e64: + break; + } + } + + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) + return false; + + // We don't need to check src0, all input types are legal, so just make sure + // src0 isn't using any modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) + return false; + + // Check output modifiers + if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + return true; +} + +/// \brief This function checks \p MI for operands defined by a move immediate +/// instruction and then folds the literal constant into the instruction if it +/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction +/// and will only fold literal constants if we are still in SSA. +static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, + MachineRegisterInfo &MRI, bool TryToCommute = true) { + + if (!MRI.isSSA()) + return; + + assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + + // Only one literal constant is allowed per instruction, so if src0 is a + // literal constant then we can't do any folding. + if (Src0.isImm() && + TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) + return; + + // Literal constants and SGPRs can only be used in Src0, so if Src0 is an + // SGPR, we cannot commute the instruction, so we can't fold any literal + // constants.
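+ // For example, v_mov_b32 v1, 0x42f60000 followed by + // v_add_f32_e32 v0, v1, v2 becomes v_add_f32_e32 v0, 0x42f60000, v2 when + // v1 has no other uses, and the now-dead v_mov_b32 is erased below (the + // constant here is only illustrative).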
+ if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) + return; + + // Try to fold Src0 + if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { + unsigned Reg = Src0.getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } + if (ConstantFolded) { + if (MRI.use_empty(Reg)) + Def->eraseFromParent(); + ++NumLiteralConstantsFolded; + return; + } + } + } + + // We have failed to fold src0, so commute the instruction and try again. + if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + foldImmediates(MI, TII, MRI, false); + +} + +// Copy MachineOperand with all flags except setting it as implicit. +static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) { + assert(!Orig.isImplicit()); + return MachineOperand::CreateReg(Orig.getReg(), + Orig.isDef(), + true, + Orig.isKill(), + Orig.isDead(), + Orig.isUndef(), + Orig.isEarlyClobber(), + Orig.getSubReg(), + Orig.isDebug(), + Orig.isInternalRead()); +} + +bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + std::vector<unsigned> I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineOperand &Src = MI.getOperand(1); + + if (Src.isImm()) { + if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + } + + continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + if (!canShrink(MI, TII, TRI, MRI)) { + // Try commuting the instruction and see if that enables us to shrink + // it. + if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + !canShrink(MI, TII, TRI, MRI)) + continue; + } + + // getVOPe32 could be -1 here if we started with an instruction that had + // a 32-bit encoding and then commuted it to an instruction that did not. + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); + + if (TII->isVOPC(Op32)) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) + // + // So, instead of forcing the instruction to write to VCC, we provide + // a hint to the register allocator to use VCC and then we we will run + // this pass again after RA and shrink it if it outputs to VCC. + MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + continue; + } + if (DstReg != AMDGPU::VCC) + continue; + } + + if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { + // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC + // instructions. 
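+ // The e32 form reads its condition implicitly from VCC, so the shrink is + // only legal once src2 actually lives in VCC; virtual registers get a VCC + // allocation hint and are handled on the post-RA run of this pass.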
+ const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (!Src2->isReg()) + continue; + unsigned SReg = Src2->getReg(); + if (TargetRegisterInfo::isVirtualRegister(SReg)) { + MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); + continue; + } + if (SReg != AMDGPU::VCC) + continue; + } + + // We can shrink this instruction + DEBUG(dbgs() << "Shrinking " << MI); + + MachineInstrBuilder Inst32 = + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); + + // Add the dst operand if the 32-bit encoding also has an explicit $dst. + // For VOPC instructions, this is replaced by an implicit def of vcc. + int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst); + if (Op32DstIdx != -1) { + // dst + Inst32.addOperand(MI.getOperand(0)); + } else { + assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + "Unexpected case"); + } + + + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = + TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + Inst32.addOperand(*Src1); + + const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) { + int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); + if (Op32Src2Idx != -1) { + Inst32.addOperand(*Src2); + } else { + // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is + // replaced with an implicit read of vcc. + assert(Src2->getReg() == AMDGPU::VCC && + "Unexpected missing register operand"); + Inst32.addOperand(copyRegOperandAsImplicit(*Src2)); + } + } + + ++NumInstructionsShrunk; + MI.eraseFromParent(); + + foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); + + + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp new file mode 100644 index 0000000..d36c5d2 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -0,0 +1,158 @@ +//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass performs the following type substitution on all +/// non-compute shaders: +/// +/// v16i8 => i128 +/// - v16i8 is used for constant memory resource descriptors. This type is +/// legal for some compute APIs, and we don't want to declare it as legal +/// in the backend, because we want the legalizer to expand all v16i8 +/// operations. +/// v1* => * +/// - Having v1* types complicates the legalizer and we can easily replace +/// them with the element type.
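+/// +/// For example, a load whose pointee type is <16 x i8> is rewritten below +/// (visitLoadInst) into a <4 x i32> load followed by a bitcast back to the +/// original type, preserving all metadata other than debug locations.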
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { + +class SITypeRewriter : public FunctionPass, + public InstVisitor<SITypeRewriter> { + + static char ID; + Module *Mod; + Type *v16i8; + Type *v4i32; + +public: + SITypeRewriter() : FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { + return "SI Type Rewriter"; + } + void visitLoadInst(LoadInst &I); + void visitCallInst(CallInst &I); + void visitBitCast(BitCastInst &I); +}; + +} // End anonymous namespace + +char SITypeRewriter::ID = 0; + +bool SITypeRewriter::doInitialization(Module &M) { + Mod = &M; + v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); + return false; +} + +bool SITypeRewriter::runOnFunction(Function &F) { + if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) + return false; + + visit(F); + visit(F); + + return false; +} + +void SITypeRewriter::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + Type *PtrTy = Ptr->getType(); + Type *ElemTy = PtrTy->getPointerElementType(); + IRBuilder<> Builder(&I); + if (ElemTy == v16i8) { + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); + LoadInst *Load = Builder.CreateLoad(BitCast); + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + I.getAllMetadataOtherThanDebugLoc(MD); + for (unsigned i = 0, e = MD.size(); i != e; ++i) { + Load->setMetadata(MD[i].first, MD[i].second); + } + Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); + I.replaceAllUsesWith(BitCastLoad); + I.eraseFromParent(); + } +} + +void SITypeRewriter::visitCallInst(CallInst &I) { + IRBuilder<> Builder(&I); + + SmallVector <Value*, 8> Args; + SmallVector <Type*, 8> Types; + bool NeedToReplace = false; + Function *F = I.getCalledFunction(); + if (!F) + return; + + std::string Name = F->getName(); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + Value *Arg = I.getArgOperand(i); + if (Arg->getType() == v16i8) { + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); + NeedToReplace = true; + Name = Name + ".v4i32"; + } else if (Arg->getType()->isVectorTy() && + Arg->getType()->getVectorNumElements() == 1 && + Arg->getType()->getVectorElementType() == + Type::getInt32Ty(I.getContext())){ + Type *ElementTy = Arg->getType()->getVectorElementType(); + std::string TypeName = "i32"; + InsertElementInst *Def = cast<InsertElementInst>(Arg); + Args.push_back(Def->getOperand(1)); + Types.push_back(ElementTy); + std::string VecTypeName = "v1" + TypeName; + Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); + NeedToReplace = true; + } else { + Args.push_back(Arg); + Types.push_back(Arg->getType()); + } + } + + if (!NeedToReplace) { + return; + } + Function *NewF = Mod->getFunction(Name); + if (!NewF) { + NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); + NewF->setAttributes(F->getAttributes()); + } + I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); + I.eraseFromParent(); +} + +void SITypeRewriter::visitBitCast(BitCastInst &I) { + IRBuilder<> Builder(&I); + if (I.getDestTy() != v4i32) { + return; + } + + if (BitCastInst *Op = 
dyn_cast<BitCastInst>(I.getOperand(0))) { + if (Op->getSrcTy() == v4i32) { + I.replaceAllUsesWith(Op->getOperand(0)); + I.eraseFromParent(); + } + } +} + +FunctionPass *llvm::createSITypeRewriter() { + return new SITypeRewriter(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp new file mode 100644 index 0000000..2112135 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -0,0 +1,30 @@ +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +/// \brief The target which suports all AMD GPUs. This will eventually +/// be deprecated and there will be a R600 target and a GCN target. +Target llvm::TheAMDGPUTarget; +/// \brief The target for GCN GPUs +Target llvm::TheGCNTarget; + +/// \brief Extern function to initialize the targets for the AMDGPU backend +extern "C" void LLVMInitializeAMDGPUTargetInfo() { + RegisterTarget<Triple::r600, false> + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp new file mode 100644 index 0000000..3b4c235 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -0,0 +1,164 @@ +//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#include "AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/SubtargetFeature.h" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM + +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + +namespace llvm { +namespace AMDGPU { + +IsaVersion getIsaVersion(const FeatureBitset &Features) { + + if (Features.test(FeatureISAVersion7_0_0)) + return {7, 0, 0}; + + if (Features.test(FeatureISAVersion7_0_1)) + return {7, 0, 1}; + + if (Features.test(FeatureISAVersion8_0_0)) + return {8, 0, 0}; + + if (Features.test(FeatureISAVersion8_0_1)) + return {8, 0, 1}; + + return {0, 0, 0}; +} + +void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, + const FeatureBitset &Features) { + + IsaVersion ISA = getIsaVersion(Features); + + memset(&Header, 0, sizeof(Header)); + + Header.amd_kernel_code_version_major = 1; + Header.amd_kernel_code_version_minor = 0; + Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU + Header.amd_machine_version_major = ISA.Major; + Header.amd_machine_version_minor = ISA.Minor; + Header.amd_machine_version_stepping = ISA.Stepping; + Header.kernel_code_entry_byte_offset = sizeof(Header); + // wavefront_size is specified as a power of 2: 2^6 = 64 threads. + Header.wavefront_size = 6; + // These alignment values are specified in powers of two, so alignment = + // 2^n. The minimum alignment is 2^4 = 16. + Header.kernarg_segment_alignment = 4; + Header.group_segment_alignment = 4; + Header.private_segment_alignment = 4; +} + +MCSection *getHSATextSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_EXECINSTR | + ELF::SHF_AMDGPU_HSA_AGENT | + ELF::SHF_AMDGPU_HSA_CODE); +} + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL); +} + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +} + +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +} + +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +} + +static unsigned getIntegerAttribute(const Function &F, const char *Name, + unsigned Default) { + Attribute A = F.getFnAttribute(Name); + unsigned Result = Default; + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, Result)) { + LLVMContext &Ctx = F.getContext(); + Ctx.emitError("can't parse shader type"); + } + } + return Result; +} + +unsigned getShaderType(const Function &F) { + return 
getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +} + +unsigned getInitialPSInputAddr(const Function &F) { + return getIntegerAttribute(F, "InitialPSInputAddr", 0); +} + +bool isSI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; +} + +bool isCI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands]; +} + +bool isVI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; +} + +unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + + switch(Reg) { + default: break; + case AMDGPU::FLAT_SCR: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi; + + case AMDGPU::FLAT_SCR_LO: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi; + + case AMDGPU::FLAT_SCR_HI: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi; + } + return Reg; +} + +} // End namespace AMDGPU +} // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h new file mode 100644 index 0000000..57cbe1b5 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -0,0 +1,62 @@ +//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H + +#include "AMDKernelCodeT.h" + +namespace llvm { + +class FeatureBitset; +class Function; +class GlobalValue; +class MCContext; +class MCSection; +class MCSubtargetInfo; + +namespace AMDGPU { + +struct IsaVersion { + unsigned Major; + unsigned Minor; + unsigned Stepping; +}; + +IsaVersion getIsaVersion(const FeatureBitset &Features); +void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, + const FeatureBitset &Features); +MCSection *getHSATextSection(MCContext &Ctx); + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); + +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); + +unsigned getShaderType(const Function &F); +unsigned getInitialPSInputAddr(const Function &F); + + +bool isSI(const MCSubtargetInfo &STI); +bool isCI(const MCSubtargetInfo &STI); +bool isVI(const MCSubtargetInfo &STI); + +/// If \p Reg is a pseudo reg, return the correct hardware register given +/// \p STI otherwise return \p Reg. +unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); + +} // end namespace AMDGPU +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td new file mode 100644 index 0000000..d8738f9 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td @@ -0,0 +1,166 @@ +//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// VI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class DSe_vi <bits<8> op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{16} = gds; + let Inst{24-17} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe_vi <bits<7> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{16} = lds; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe_vi <bits<4> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{18-15} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class SMEMe_vi <bits<8> op, bit imm> : Enc64 { + bits<7> sbase; + bits<7> sdata; + bits<1> glc; + bits<20> offset; + + let Inst{5-0} = sbase{6-1}; + let Inst{12-6} = sdata; + let Inst{16} = glc; + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = offset; +} + +class VOP3e_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class EXPe_vi : EXPe { + let Inst{31-26} = 0x31; //encoding +} + +class VINTRPe_vi <bits<2> op> : VINTRPe <op> { + let Inst{31-26} = 0x35; // encoding +} diff --git 
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
new file mode 100644
index 0000000..1a7801c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -0,0 +1,112 @@
+//===-- VIInstructions.td - VI Instruction Defintions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in {
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+defm V_CVT_F16_U16 : VOP1Inst <vop1<0, 0x39>, "v_cvt_f16_u16", VOP_F16_I16>;
+defm V_CVT_F16_I16 : VOP1Inst <vop1<0, 0x3a>, "v_cvt_f16_i16", VOP_F16_I16>;
+defm V_CVT_U16_F16 : VOP1Inst <vop1<0, 0x3b>, "v_cvt_u16_f16", VOP_I16_F16>;
+defm V_CVT_I16_F16 : VOP1Inst <vop1<0, 0x3c>, "v_cvt_i16_f16", VOP_I16_F16>;
+defm V_RCP_F16 : VOP1Inst <vop1<0, 0x3d>, "v_rcp_f16", VOP_F16_F16>;
+defm V_SQRT_F16 : VOP1Inst <vop1<0, 0x3e>, "v_sqrt_f16", VOP_F16_F16>;
+defm V_RSQ_F16 : VOP1Inst <vop1<0, 0x3f>, "v_rsq_f16", VOP_F16_F16>;
+defm V_LOG_F16 : VOP1Inst <vop1<0, 0x40>, "v_log_f16", VOP_F16_F16>;
+defm V_EXP_F16 : VOP1Inst <vop1<0, 0x41>, "v_exp_f16", VOP_F16_F16>;
+defm V_FREXP_MANT_F16 : VOP1Inst <vop1<0, 0x42>, "v_frexp_mant_f16",
+  VOP_F16_F16
+>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <vop1<0, 0x43>, "v_frexp_exp_i16_f16",
+  VOP_I16_F16
+>;
+defm V_FLOOR_F16 : VOP1Inst <vop1<0, 0x44>, "v_floor_f16", VOP_F16_F16>;
+defm V_CEIL_F16 : VOP1Inst <vop1<0, 0x45>, "v_ceil_f16", VOP_F16_F16>;
+defm V_TRUNC_F16 : VOP1Inst <vop1<0, 0x46>, "v_trunc_f16", VOP_F16_F16>;
+defm V_RNDNE_F16 : VOP1Inst <vop1<0, 0x47>, "v_rndne_f16", VOP_F16_F16>;
+defm V_FRACT_F16 : VOP1Inst <vop1<0, 0x48>, "v_fract_f16", VOP_F16_F16>;
+defm V_SIN_F16 : VOP1Inst <vop1<0, 0x49>, "v_sin_f16", VOP_F16_F16>;
+defm V_COS_F16 : VOP1Inst <vop1<0, 0x4a>, "v_cos_f16", VOP_F16_F16>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+
+defm V_ADD_F16 : VOP2Inst <vop2<0, 0x1f>, "v_add_f16", VOP_F16_F16_F16>;
+defm V_SUB_F16 : VOP2Inst <vop2<0, 0x20>, "v_sub_f16", VOP_F16_F16_F16>;
+defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16,
+  null_frag, "v_sub_f16"
+>;
+defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>;
+defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>;
+} // End isCommutable = 1
+defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16">;
+let isCommutable = 1 in {
+defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16">;
+defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>;
+defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>;
+defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>;
+defm V_MUL_LO_U16 : VOP2Inst <vop2<0,0x29>, "v_mul_lo_u16", VOP_I16_I16_I16>;
+} // End isCommutable = 1
+defm V_LSHLREV_B16 : VOP2Inst <vop2<0,0x2a>, "v_lshlrev_b16", VOP_I16_I16_I16>;
+defm V_LSHRREV_B16 : VOP2Inst <vop2<0,0x2b>, "v_lshrrev_b16", VOP_I16_I16_I16>;
+defm V_ASHRREV_B16 : VOP2Inst <vop2<0,0x2c>, "v_ashrrev_b16", VOP_I16_I16_I16>;
+let isCommutable = 1 in {
+defm V_MAX_F16 : VOP2Inst <vop2<0,0x2d>, "v_max_f16", VOP_F16_F16_F16>;
+defm V_MIN_F16 : VOP2Inst <vop2<0,0x2e>, "v_min_f16", VOP_F16_F16_F16>;
+defm V_MAX_U16 : VOP2Inst <vop2<0,0x2f>, "v_max_u16", VOP_I16_I16_I16>;
+defm V_MAX_I16 : VOP2Inst <vop2<0,0x30>, "v_max_i16", VOP_I16_I16_I16>;
+defm V_MIN_U16 : VOP2Inst <vop2<0,0x31>, "v_min_u16", VOP_I16_I16_I16>;
+defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
+} // End isCommutable = 1
+defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
+
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
+
+class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+  name#" $dst, $src0, $src1",
+  (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0)
+>, PredicateControl {
+  let UseInstAsmMatchConverter = 0;
+}
+
+def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
+
+//===----------------------------------------------------------------------===//
+// SMEM Instructions
+//===----------------------------------------------------------------------===//
+
+def S_DCACHE_WB : SMEM_Inval <0x21,
+  "s_dcache_wb", int_amdgcn_s_dcache_wb>;
+
+def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
+  "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+
+} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
+
+let Predicates = [isVI] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+} // End Predicates = [isVI]
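Editor's note: for readers not used to TableGen Enc64 classes, the VOP3e_vi layout in VIInstrFormats.td above can be read as a plain 64-bit bit-packing (VI VOP3a major opcode 0x34 in bits 31-26, with each source modifier split so that bit {1} lands in the low word and bit {0} in the high word). The standalone C++ sketch below is not part of the diff; it only mirrors the `let Inst{...}` assignments by hand. The struct and function names (VOP3Operands, encodeVOP3e_vi) are illustrative inventions, not LLVM APIs.

#include <cstdint>

// Hypothetical operand bundle mirroring the fields of VOP3e_vi.
struct VOP3Operands {
  uint8_t  vdst;            // 8 bits
  uint8_t  src0_modifiers;  // 2 bits; bit {1} -> Inst{8}, bit {0} -> Inst{61}
  uint16_t src0;            // 9 bits
  uint8_t  src1_modifiers;  // 2 bits; bit {1} -> Inst{9}, bit {0} -> Inst{62}
  uint16_t src1;            // 9 bits
  uint8_t  src2_modifiers;  // 2 bits; bit {1} -> Inst{10}, bit {0} -> Inst{63}
  uint16_t src2;            // 9 bits
  uint8_t  clamp;           // 1 bit
  uint8_t  omod;            // 2 bits
};

// Pack the fields exactly as the `let Inst{...}` assignments above do.
uint64_t encodeVOP3e_vi(unsigned op, const VOP3Operands &O) {
  uint64_t Inst = 0;
  Inst |= uint64_t(O.vdst);                             // Inst{7-0}   = vdst
  Inst |= uint64_t((O.src0_modifiers >> 1) & 1) << 8;   // Inst{8}     = src0_modifiers{1}
  Inst |= uint64_t((O.src1_modifiers >> 1) & 1) << 9;   // Inst{9}     = src1_modifiers{1}
  Inst |= uint64_t((O.src2_modifiers >> 1) & 1) << 10;  // Inst{10}    = src2_modifiers{1}
  Inst |= uint64_t(O.clamp & 1) << 15;                  // Inst{15}    = clamp
  Inst |= uint64_t(op & 0x3ff) << 16;                   // Inst{25-16} = op
  Inst |= uint64_t(0x34) << 26;                         // Inst{31-26} = encoding
  Inst |= uint64_t(O.src0 & 0x1ff) << 32;               // Inst{40-32} = src0
  Inst |= uint64_t(O.src1 & 0x1ff) << 41;               // Inst{49-41} = src1
  Inst |= uint64_t(O.src2 & 0x1ff) << 50;               // Inst{58-50} = src2
  Inst |= uint64_t(O.omod & 3) << 59;                   // Inst{60-59} = omod
  Inst |= uint64_t(O.src0_modifiers & 1) << 61;         // Inst{61}    = src0_modifiers{0}
  Inst |= uint64_t(O.src1_modifiers & 1) << 62;         // Inst{62}    = src1_modifiers{0}
  Inst |= uint64_t(O.src2_modifiers & 1) << 63;         // Inst{63}    = src2_modifiers{0}
  return Inst;
}

The same exercise works for the other Enc64 classes above (DSe_vi, MUBUFe_vi, MTBUFe_vi, SMEMe_vi, VOP3be_vi): only the field list and the 6-bit major opcode placed in Inst{31-26} change, which is exactly what the *_vi format classes override relative to their SI counterparts.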