| Field | Value | Date |
|---|---|---|
| author | dim <dim@FreeBSD.org> | 2015-12-30 13:13:10 +0000 |
| committer | dim <dim@FreeBSD.org> | 2015-12-30 13:13:10 +0000 |
| commit | 9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a (patch) | |
| tree | b466a4817f79516eb1df8eae92bccf62ecc84003 /contrib/llvm/lib/Target/AMDGPU | |
| parent | f09a28d1de99fda4f5517fb12670fc36552f4927 (diff) | |
| parent | e194cd6d03d91631334d9d5e55b506036f423cc8 (diff) | |
| download | FreeBSD-src-9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a.zip, FreeBSD-src-9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a.tar.gz | |
Update llvm to trunk r256633.
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
89 files changed, 6128 insertions, 2660 deletions
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0a05d25..8c3cb56 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -44,15 +44,21 @@ FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
-FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
+
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
 
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIFixSGPRCopiesPass(PassRegistry &);
+extern char &SIFixSGPRCopiesID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
@@ -64,6 +70,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+FunctionPass *createAMDGPUAnnotateUniformValues();
 
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
 extern char &SIFixControlFlowLiveIntervalsID;
@@ -71,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID;
 void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
 extern char &SIFixSGPRLiveRangesID;
 
+void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
+extern char &AMDGPUAnnotateUniformValuesPassID;
+
 extern Target TheAMDGPUTarget;
 extern Target TheGCNTarget;
 
@@ -85,8 +95,6 @@ enum TargetIndex {
 };
 }
 
-#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
-
 } // End namespace llvm
 
 namespace ShaderType {
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
index 68b5050..d4af8d2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -108,6 +108,11 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol
   "true",
   "Force using DS instruction immediate offsets on SI">;
 
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+  "FlatForGlobal",
+  "true",
+  "Force to generate flat instruction for global">;
+
 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
   "FlatAddressSpace",
   "true",
@@ -272,9 +277,14 @@ def isSICI : Predicate<
   "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
 >, AssemblerPredicate<"FeatureGCN1Encoding">;
 
+def isVI : Predicate <
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+  AssemblerPredicate<"FeatureGCN3Encoding">;
+
 class PredicateControl {
   Predicate SubtargetPredicate;
   Predicate SIAssemblerPredicate = isSICI;
+  Predicate VIAssemblerPredicate = isVI;
   list<Predicate> AssemblerPredicates = [];
   Predicate AssemblerPredicate = TruePredicate;
   list<Predicate> OtherPredicates = [];
```
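For orientation, the new `flat-for-global` subtarget feature surfaces in C++ as a query on the subtarget; a minimal sketch of how selection code consults it (the accessor name matches its use in `SelectMUBUF` later in this commit, the wrapper function is illustrative only):

```cpp
#include "AMDGPUSubtarget.h"

// Hedged sketch: a SubtargetFeature with field FlatForGlobal becomes a bool
// member on the subtarget; when it is set, the MUBUF addressing path simply
// declines to match so that FLAT instructions are used for global accesses.
static bool shouldUseMUBUFForGlobal(const llvm::AMDGPUSubtarget *Subtarget) {
  return !Subtarget->useFlatForGlobal();
}
```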
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
new file mode 100644
index 0000000..3781839
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,126 @@
+//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions which use intrinsics
+/// which will impact calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+  void addAttrToCallers(Function *Intrin, StringRef AttrName);
+  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+  static char ID;
+
+  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  const char *getPassName() const override {
+    return "AMDGPU Annotate Kernel Features";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                      "Add AMDGPU function attributes", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                    "Add AMDGPU function attributes", false, false)
+
+
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+                                                    StringRef AttrName) {
+  SmallPtrSet<Function *, 4> SeenFuncs;
+
+  for (User *U : Intrin->users()) {
+    // CallInst is the only valid user for an intrinsic.
+    CallInst *CI = cast<CallInst>(U);
+
+    Function *CallingFunction = CI->getParent()->getParent();
+    if (SeenFuncs.insert(CallingFunction).second)
+      CallingFunction->addFnAttr(AttrName);
+  }
+}
+
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+  Module &M,
+  ArrayRef<StringRef[2]> IntrinsicToAttr) {
+  bool Changed = false;
+
+  for (const StringRef *Arr : IntrinsicToAttr) {
+    if (Function *Fn = M.getFunction(Arr[0])) {
+      addAttrToCallers(Fn, Arr[1]);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+  Triple TT(M.getTargetTriple());
+
+  static const StringRef IntrinsicToAttr[][2] = {
+    // .x omitted
+    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+    // .x omitted
+    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+
+  };
+
+  static const StringRef HSAIntrinsicToAttr[][2] = {
+    { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
+
+    { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
+    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+  };
+
+  // TODO: Intrinsics that require queue ptr.
+
+  // We do not need to note the x workitem or workgroup id because they are
+  // always initialized.
+
+  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+  if (TT.getOS() == Triple::AMDHSA)
+    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+  return new AMDGPUAnnotateKernelFeatures();
+}
```
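A hedged sketch of how the new module pass slots into a legacy pass pipeline and what it leaves behind (the driver function and the kernel name are hypothetical; the create function and attribute strings come from the patch itself):

```cpp
#include "AMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Minimal driver sketch: run the annotation pass over a module, then
// observe the string attribute it adds to callers of the .y/.z intrinsics.
static void annotateKernelFeatures(Module &M) {
  legacy::PassManager PM;
  PM.add(createAMDGPUAnnotateKernelFeaturesPass());
  PM.run(M);

  // "my_kernel" is a hypothetical name; any function calling
  // llvm.r600.read.tgid.y now carries the attribute that calling
  // convention lowering consumes.
  if (Function *F = M.getFunction("my_kernel"))
    (void)F->hasFnAttribute("amdgpu-work-group-id-y"); // true after the pass
}
```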
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
new file mode 100644
index 0000000..dfddc34
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -0,0 +1,84 @@
+//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass adds amdgpu.uniform metadata to IR values so this information
+/// can be used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-uniform"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateUniformValues : public FunctionPass,
+                       public InstVisitor<AMDGPUAnnotateUniformValues> {
+  DivergenceAnalysis *DA;
+
+public:
+  static char ID;
+  AMDGPUAnnotateUniformValues() :
+    FunctionPass(ID) { }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DivergenceAnalysis>();
+    AU.setPreservesAll();
+  }
+
+  void visitLoadInst(LoadInst &I);
+
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+                      "Add AMDGPU uniform metadata", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+                    "Add AMDGPU uniform metadata", false, false)
+
+char AMDGPUAnnotateUniformValues::ID = 0;
+
+void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  if (!DA->isUniform(Ptr))
+    return;
+
+  if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+    PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+
+}
+
+bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+  return false;
+}
+
+bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+  DA = &getAnalysis<DivergenceAnalysis>();
+  visit(F);
+
+  return true;
+}
+
+FunctionPass *
+llvm::createAMDGPUAnnotateUniformValues() {
+  return new AMDGPUAnnotateUniformValues();
+}
```
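For context, the pass attaches an empty `MDNode`, so presence is the entire signal. A hedged sketch of how a later consumer (for example, instruction selection) can test for the annotation; the helper is an assumption about a consumer, not part of this patch:

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// The metadata node carries no operands; a null check on the pointer
// operand's defining instruction is all a consumer needs.
static bool isAnnotatedUniform(const LoadInst &I) {
  const Instruction *PtrI = dyn_cast<Instruction>(I.getPointerOperand());
  return PtrI && PtrI->getMetadata("amdgpu.uniform") != nullptr;
}
```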
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0a5309b..ba71dc0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -100,14 +100,63 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   }
 }
 
-void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-
-  // This label is used to mark the end of the .text section.
-  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-  OutStreamer->SwitchSection(TLOF.getTextSection());
-  MCSymbol *EndOfTextLabel =
-      OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-  OutStreamer->EmitLabel(EndOfTextLabel);
+void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+  if (MFI->isKernel() && STM.isAmdHsaOS()) {
+    AMDGPUTargetStreamer *TS =
+        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+    TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
+                             ELF::STT_AMDGPU_HSA_KERNEL);
+  }
+
+  AsmPrinter::EmitFunctionEntryLabel();
+}
+
+static bool isModuleLinkage(const GlobalValue *GV) {
+  switch (GV->getLinkage()) {
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::CommonLinkage:
+   return true;
+  case GlobalValue::ExternalLinkage:
+   return false;
+  default: llvm_unreachable("unknown linkage type");
+  }
+}
+
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+
+  if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
+    AsmPrinter::EmitGlobalVariable(GV);
+    return;
+  }
+
+  if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
+    AsmPrinter::EmitGlobalVariable(GV);
+    return;
+  }
+
+  // Group segment variables aren't emitted in HSA.
+  if (AMDGPU::isGroupSegment(GV))
+    return;
+
+  AMDGPUTargetStreamer *TS =
+      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+  if (isModuleLinkage(GV)) {
+    TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
+  } else {
+    TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
+  }
+
+  const DataLayout &DL = getDataLayout();
+  OutStreamer->PushSection();
+  OutStreamer->SwitchSection(
+      getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
+  MCSymbol *GVSym = getSymbol(GV);
+  const Constant *C = GV->getInitializer();
+  OutStreamer->EmitLabel(GVSym);
+  EmitGlobalConstant(DL, C);
+  OutStreamer->PopSection();
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -125,8 +174,8 @@
   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    getSIProgramInfo(KernelInfo, MF);
     if (!STM.isAmdHsaOS()) {
-      getSIProgramInfo(KernelInfo, MF);
       EmitProgramInfoSI(MF, KernelInfo);
     }
     // Emit directives
@@ -165,6 +214,23 @@
                                   false);
       OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
                                   false);
+
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+
     } else {
       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       OutStreamer->emitRawComment(
@@ -278,27 +344,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         unsigned width = 0;
         bool isSGPR = false;
 
-        if (!MO.isReg()) {
+        if (!MO.isReg())
           continue;
-        }
+
         unsigned reg = MO.getReg();
-        if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
-            reg == AMDGPU::VCC_HI) {
+        switch (reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
           VCCUsed = true;
           continue;
-        } else if (reg == AMDGPU::FLAT_SCR ||
-                   reg == AMDGPU::FLAT_SCR_LO ||
-                   reg == AMDGPU::FLAT_SCR_HI) {
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
           FlatUsed = true;
           continue;
-        }
 
-        switch (reg) {
-        default: break;
-        case AMDGPU::SCC:
-        case AMDGPU::EXEC:
-        case AMDGPU::M0:
-          continue;
+        default:
+          break;
         }
 
         if (AMDGPU::SReg_32RegClass.contains(reg)) {
@@ -348,11 +417,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     }
   }
 
-  if (VCCUsed)
+  if (VCCUsed || FlatUsed)
     MaxSGPR += 2;
 
-  if (FlatUsed)
+  if (FlatUsed) {
     MaxSGPR += 2;
+    // 2 additional for VI+.
+    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      MaxSGPR += 2;
+  }
 
   // We found the maximum register index. They start at 0, so add one to get the
   // number of registers.
@@ -368,6 +441,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
   }
 
+  if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+    LLVMContext &Ctx = MF.getFunction()->getContext();
+    Ctx.emitError("too many user SGPRs used");
+  }
+
   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
@@ -419,18 +497,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
       S_00B848_PRIV(ProgInfo.Priv) |
       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
-      S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
+  // 0 = X, 1 = XY, 2 = XYZ
+  unsigned TIDIGCompCnt = 0;
+  if (MFI->hasWorkItemIDZ())
+    TIDIGCompCnt = 2;
+  else if (MFI->hasWorkItemIDY())
+    TIDIGCompCnt = 1;
+
   ProgInfo.ComputePGMRSrc2 =
       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
-      S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
-      S_00B84C_TGID_X_EN(1) |
-      S_00B84C_TGID_Y_EN(1) |
-      S_00B84C_TGID_Z_EN(1) |
-      S_00B84C_TG_SIZE_EN(1) |
-      S_00B84C_TIDIG_COMP_CNT(2) |
-      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+      S_00B84C_EXCP_EN_MSB(0) |
+      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+      S_00B84C_EXCP_EN(0);
 }
 
 static unsigned getRsrcReg(unsigned ShaderType) {
@@ -491,14 +578,53 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
   header.compute_pgm_resource_registers =
       KernelInfo.ComputePGMRSrc1 |
       (KernelInfo.ComputePGMRSrc2 << 32);
-  header.code_properties =
-      AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
-      AMD_CODE_PROPERTY_IS_PTR64;
+  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+  if (MFI->hasPrivateSegmentBuffer()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (MFI->hasQueuePtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+  if (MFI->hasKernargSegmentPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+  if (MFI->hasDispatchID())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+  if (MFI->hasFlatScratchInit())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  // TODO: Private segment size
+
+  if (MFI->hasGridWorkgroupCountX()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+
+  if (MFI->hasGridWorkgroupCountY()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+
+  if (MFI->hasGridWorkgroupCountZ()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
   header.kernarg_segment_byte_size = MFI->ABIArgOffset;
   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
   header.workitem_vgpr_count = KernelInfo.NumVGPR;
-
+  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
 
   AMDGPUTargetStreamer *TS =
       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
```
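A worked example of the reserved-register accounting introduced above (all values are assumed, not from the patch): on a VI target where the highest explicitly used SGPR index is 10 and both VCC and flat_scratch are used:

```cpp
// Mirrors the adjustment logic in getSIProgramInfo; illustrative only.
static unsigned countSGPRs() {
  unsigned MaxSGPR = 10;        // highest SGPR index seen in the scan
  bool VCCUsed = true, FlatUsed = true, IsVIPlus = true;

  if (VCCUsed || FlatUsed)
    MaxSGPR += 2;               // reserve the VCC register pair
  if (FlatUsed) {
    MaxSGPR += 2;               // reserve the flat_scratch pair
    if (IsVIPlus)
      MaxSGPR += 2;             // "2 additional for VI+"
  }
  return MaxSGPR + 1;           // indices are 0-based -> 17 registers here
}
```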
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 345af9b..817cbfc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -99,7 +99,9 @@ public:
 
   void EmitFunctionBodyStart() override;
 
-  void EmitEndOfAsmFile(Module &M) override;
+  void EmitFunctionEntryLabel() override;
+
+  void EmitGlobalVariable(const GlobalVariable *GV) override;
 
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
new file mode 100644
index 0000000..2f6b302
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
@@ -0,0 +1,26 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
+
+using namespace llvm;
+
+DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
+  const Function &Fn,
+  const Twine &Desc,
+  DiagnosticSeverity Severity)
+  : DiagnosticInfo(getKindID(), Severity),
+    Description(Desc),
+    Fn(Fn) { }
+
+int DiagnosticInfoUnsupported::KindID = 0;
+
+void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
+  DP << "unsupported " << getDescription() << " in " << Fn.getName();
+}
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
new file mode 100644
index 0000000..0fd37e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
@@ -0,0 +1,48 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+
+namespace llvm {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+  const Twine &Description;
+  const Function &Fn;
+
+  static int KindID;
+
+  static int getKindID() {
+    if (KindID == 0)
+      KindID = llvm::getNextAvailablePluginDiagnosticKind();
+    return KindID;
+  }
+
+public:
+  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+                            DiagnosticSeverity Severity = DS_Error);
+
+  const Function &getFunction() const { return Fn; }
+  const Twine &getDescription() const { return Description; }
+
+  void print(DiagnosticPrinter &DP) const override;
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == getKindID();
+  }
+};
+
+}
+
+#endif
```
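A hedged usage sketch for the now-public diagnostic class: the lazily allocated KindID from getNextAvailablePluginDiagnosticKind() is what lets LLVM's cast machinery recognize it, both when emitting and when filtering in a diagnostic handler (the handler wiring shown is illustrative, only the emit side appears in this patch):

```cpp
#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Emitting: lowering code with a Function and a context reports an
// unsupported construct (this mirrors the addrspacecast use in this commit).
static void reportUnsupported(LLVMContext &Ctx, const Function &Fn) {
  DiagnosticInfoUnsupported D(Fn, "addrspacecast not implemented");
  Ctx.diagnose(D); // "unsupported addrspacecast not implemented in <fn>"
}

// Receiving: classof() keys on the custom KindID, so dyn_cast works on the
// opaque DiagnosticInfo stream.
static void onDiagnostic(const DiagnosticInfo &DI) {
  if (const auto *DU = dyn_cast<DiagnosticInfoUnsupported>(&DI))
    errs() << "unsupported construct in " << DU->getFunction().getName() << '\n';
}
```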
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 8175786..4d84d28 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
 }
 
 /// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
-                                             int FI) const {
+int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                int FI,
+                                                unsigned &FrameReg) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+  // Fill in FrameReg output argument.
+  FrameReg = RI->getFrameRegister(MF);
+
   // Start the offset at 2 so we don't overwrite work group information.
   // XXX: We should only do this when the shader actually uses this
   // information.
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 9f31be1..257a3da 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,14 +8,12 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief Interface to describe a layout of a stack frame on a AMDIL target
-/// machine.
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
 //
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
-#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
 
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetFrameLowering.h"
 
 namespace llvm {
@@ -34,7 +32,8 @@ public:
   /// \returns The number of 32-bit sub-registers that are used when storing
   /// values to the stack.
   unsigned getStackWidth(const MachineFunction &MF) const;
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
   const SpillSlot *
     getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
```
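The signature change ripples to callers: instead of returning only an offset, the hook now also reports which register serves as the frame base. A minimal caller-side sketch (a reduction of the LowerFrameIndex update later in this commit; the wrapper function is illustrative):

```cpp
#include "AMDGPUFrameLowering.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

// Callers that only want the offset pass a scratch variable, exactly as
// LowerFrameIndex does with IgnoredFrameReg in AMDGPUISelLowering.cpp below.
static int frameOffsetOnly(const AMDGPUFrameLowering &TFL,
                           const MachineFunction &MF, int FI) {
  unsigned IgnoredFrameReg;
  return TFL.getFrameIndexReference(MF, FI, IgnoredFrameReg);
}
```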
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 64c54cc..b33040b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -11,6 +11,8 @@
 /// \brief Defines an instruction selector for the AMDGPU target.
 //
 //===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPURegisterInfo.h"
@@ -20,9 +22,9 @@
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Function.h"
@@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
   const AMDGPUSubtarget *Subtarget;
+
 public:
   AMDGPUDAGToDAGISel(TargetMachine &TM);
   virtual ~AMDGPUDAGToDAGISel();
   bool runOnMachineFunction(MachineFunction &MF) override;
   SDNode *Select(SDNode *N) override;
   const char *getPassName() const override;
+  void PreprocessISelDAG() override;
   void PostprocessISelDAG() override;
 
 private:
@@ -91,7 +95,7 @@ private:
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
-  void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                    SDValue &TFE) const;
@@ -108,6 +112,16 @@ private:
                          SDValue &TFE) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                          SDValue &Offset, SDValue &GLC) const;
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+                        bool &Imm) const;
+  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+                  bool &Imm) const;
+  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
   SDNode *SelectAddrSpaceCast(SDNode *N);
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
   return N;
 }
 
+static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
+  switch (NumVectorElts) {
+  case 1:
+    return AMDGPU::SReg_32RegClassID;
+  case 2:
+    return AMDGPU::SReg_64RegClassID;
+  case 4:
+    return AMDGPU::SReg_128RegClassID;
+  case 8:
+    return AMDGPU::SReg_256RegClassID;
+  case 16:
+    return AMDGPU::SReg_512RegClassID;
+  }
+
+  llvm_unreachable("invalid vector size");
+}
+
 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     EVT EltVT = VT.getVectorElementType();
     assert(EltVT.bitsEq(MVT::i32));
     if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-      bool UseVReg = true;
-      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-                                                    U != E; ++U) {
-        if (!U->isMachineOpcode()) {
-          continue;
-        }
-        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-        if (!RC) {
-          continue;
-        }
-        if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
-          UseVReg = false;
-        }
-      }
-      switch(NumVectorElts) {
-      case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
-                                     AMDGPU::SReg_32RegClassID;
-        break;
-      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
-                                     AMDGPU::SReg_64RegClassID;
-        break;
-      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
-                                     AMDGPU::SReg_128RegClassID;
-        break;
-      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
-                                     AMDGPU::SReg_256RegClassID;
-        break;
-      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
-                                      AMDGPU::SReg_512RegClassID;
-        break;
-      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
-      }
+      RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
     } else {
       // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
       // that adds a 128 bits reg copy when going through TwoAddressInstructions
@@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                   N->getValueType(0), Ops);
   }
-
-  case ISD::LOAD: {
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    SDLoc SL(N);
-    EVT VT = N->getValueType(0);
-
-    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
-      N = glueCopyToM0(N);
-      break;
-    }
-
-    // To simplify the TableGen patters, we replace all i64 loads with
-    // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
-    // during DAG legalization, however, so places (ExpandUnalignedLoad)
-    // in the DAG legalizer assume that if i64 is legal, so doing this
-    // promotion early can cause problems.
-
-    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
-                                      LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
-                                      MVT::i64, NewLoad);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
-    SDNode *Load = glueCopyToM0(NewLoad.getNode());
-    SelectCode(Load);
-    N = BitCast.getNode();
-    break;
-  }
-
+  case ISD::LOAD:
   case ISD::STORE: {
-    // Handle i64 stores here for the same reason mentioned above for loads.
-    StoreSDNode *ST = cast<StoreSDNode>(N);
-    SDValue Value = ST->getValue();
-    if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
-
-      SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
-                                        MVT::v2i32, Value);
-      SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
-                                          ST->getBasePtr(), ST->getMemOperand());
-
-      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
-
-      if (NewValue.getOpcode() == ISD::BITCAST) {
-        Select(NewStore.getNode());
-        return SelectCode(NewValue.getNode());
-      }
-
-      // getNode() may fold the bitcast if its input was another bitcast.  If that
-      // happens we should only select the new store.
-      N = NewStore.getNode();
-    }
-
     N = glueCopyToM0(N);
     break;
   }
 
-  case AMDGPUISD::REGISTER_LOAD: {
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      break;
-    SDValue Addr, Offset;
-
-    SDLoc DL(N);
-    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
-    const SDValue Ops[] = {
-      Addr,
-      Offset,
-      CurDAG->getTargetConstant(0, DL, MVT::i32),
-      N->getOperand(0),
-    };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
-                                  CurDAG->getVTList(MVT::i32, MVT::i64,
-                                                    MVT::Other),
-                                  Ops);
-  }
-  case AMDGPUISD::REGISTER_STORE: {
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      break;
-    SDValue Addr, Offset;
-    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
-    SDLoc DL(N);
-    const SDValue Ops[] = {
-      N->getOperand(1),
-      Addr,
-      Offset,
-      CurDAG->getTargetConstant(0, DL, MVT::i32),
-      N->getOperand(0),
-    };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
-                                  CurDAG->getVTList(MVT::Other),
-                                  Ops);
-  }
-
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
     return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
                     N->getOperand(0), OffsetVal, WidthVal);
-
   }
   case AMDGPUISD::DIV_SCALE: {
     return SelectDIV_SCALE(N);
@@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }
 
-
 bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
   assert(AS != 0 && "Use checkPrivateAddress instead.");
   if (!Ptr)
@@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
   if (checkPrivateAddress(N->getMemOperand())) {
     if (MMO) {
       const PseudoSourceValue *PSV = MMO->getPseudoValue();
-      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+      if (PSV && PSV->isConstantPool()) {
         return true;
       }
     }
@@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   unsigned Opc
     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
 
-  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+  // omod
   SDValue Ops[8];
 
   SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
       Offset = N1;
       return true;
     }
-  }
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      int64_t ByteOffset = C->getSExtValue();
+      if (isUInt<16>(ByteOffset)) {
+        SDLoc DL(Addr);
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset = Addr.getOperand(0);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    // If we have a constant address, prefer to put the constant into the
+    // offset. This can save moves to load the constant address since multiple
+    // operations can share the zero base address register, and enables merging
+    // into read2 / write2 instructions.
 
-  SDLoc DL(Addr);
+    SDLoc DL(Addr);
 
-  // If we have a constant address, prefer to put the constant into the
-  // offset. This can save moves to load the constant address since multiple
-  // operations can share the zero base address register, and enables merging
-  // into read2 / write2 instructions.
-  if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     if (isUInt<16>(CAddr->getZExtValue())) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
@@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
 
   // default case
   Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
   return true;
 }
 
+// TODO: If offset is too big, put low 16-bit into offset.
 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
@@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
       Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
       return true;
     }
-  }
-
-  if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      unsigned DWordOffset0 = C->getZExtValue() / 4;
+      unsigned DWordOffset1 = DWordOffset0 + 1;
+
+      if (isUInt<8>(DWordOffset0)) {
+        SDLoc DL(Addr);
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
     unsigned DWordOffset1 = DWordOffset0 + 1;
     assert(4 * DWordOffset0 == CAddr->getZExtValue());
@@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
   return isUInt<12>(Imm->getZExtValue());
 }
 
-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
                                      SDValue &Idxen, SDValue &Addr64,
                                      SDValue &GLC, SDValue &SLC,
                                      SDValue &TFE) const {
+  // Subtarget prefers to use flat instruction
+  if (Subtarget->useFlatForGlobal())
+    return false;
+
   SDLoc DL(Addr);
 
   GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
 
       if (isLegalMUBUFImmOffset(C1)) {
         Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-        return;
+        return true;
       } else if (isUInt<32>(C1->getZExtValue())) {
         // Illegal offset, store it in soffset.
         Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
         SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                    CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
                           0);
-        return;
+        return true;
       }
     }
 
@@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
       Ptr = N0;
       VAddr = N1;
       Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-      return;
+      return true;
     }
 
   // default case -> offset
@@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
   Ptr = Addr;
   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
 
+  return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
@@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     return false;
 
-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;
 
   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
   if (C->getSExtValue()) {
@@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                            SDValue &VAddr, SDValue &SOffset,
-                                          SDValue &Offset,
-                                          SDValue &SLC) const {
+                                           SDValue &Offset,
+                                           SDValue &SLC) const {
   SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
 
   SDValue GLC, TFE;
@@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
 
   SDLoc DL(Addr);
   MachineFunction &MF = CurDAG->getMachineFunction();
-  const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SITargetLowering& Lowering =
-    *static_cast<const SITargetLowering*>(getTargetLowering());
-
-  unsigned ScratchOffsetReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-  Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
-                                ScratchOffsetReg, MVT::i32);
-  SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
-  SDValue ScratchRsrcDword0 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
-  SDValue ScratchRsrcDword1 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
-
-  const SDValue RsrcOps[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-      ScratchRsrcDword0,
-      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      ScratchRsrcDword1,
-      CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
-  };
-  SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                              MVT::v2i32, RsrcOps), 0);
-  Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
-  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
-      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;
 
   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
@@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
 }
 
+///
+/// \param EncodedOffset This is the immediate value that will be encoded
+///        directly into the instruction. On SI/CI the \p EncodedOffset
+///        will be in units of dwords and on VI+ it will be units of bytes.
+static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
+                                 int64_t EncodedOffset) {
+  return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+    isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+                                          SDValue &Offset, bool &Imm) const {
+
+  // FIXME: Handle non-constant offsets.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
+  if (!C)
+    return false;
+
+  SDLoc SL(ByteOffsetNode);
+  AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+  int64_t ByteOffset = C->getSExtValue();
+  int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+    ByteOffset >> 2 : ByteOffset;
+
+  if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+    Imm = true;
+    return true;
+  }
+
+  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+    return false;
+
+  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
+    // 32-bit Immediates are supported on Sea Islands.
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+  } else {
+    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
+                                            C32Bit), 0);
+  }
+  Imm = false;
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+                                    SDValue &Offset, bool &Imm) const {
+
+  SDLoc SL(Addr);
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+
+    if (SelectSMRDOffset(N1, Offset, Imm)) {
+      SBase = N0;
+      return true;
+    }
+  }
+  SBase = Addr;
+  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+  Imm = true;
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
+                                       SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
+                                         SDValue &Offset) const {
+
+  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  bool Imm;
+  if (!SelectSMRD(Addr, SBase, Offset, Imm))
+    return false;
+
+  return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
+                                        SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
+         !isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
+                                             SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
+                                               SDValue &Offset) const {
+  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  bool Imm;
+  if (!SelectSMRDOffset(Addr, Offset, Imm))
+    return false;
+
+  return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
+                                              SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
+         !isa<ConstantSDNode>(Offset);
+}
+
 // FIXME: This is incorrect and only enough to be able to compile.
 SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
   SDLoc DL(N);
 
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
+                                           "addrspacecast not implemented");
+  CurDAG->getContext()->diagnose(NotImplemented);
+
   assert(Subtarget->hasFlatAddressSpace() &&
          "addrspacecast only supported with flat address space!");
 
-  assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
-          ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
-         "Cannot cast address space to / from constant address!");
-
   assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
           ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
          "Can only cast to / from flat address space!");
@@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
                    CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
   }
 
-
   if (DestSize > SrcSize) {
     assert(SrcSize == 32 && DestSize == 64);
 
@@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+  bool Modified = false;
+
+  // XXX - Other targets seem to be able to do this without a worklist.
+  SmallVector<LoadSDNode *, 8> LoadsToReplace;
+  SmallVector<StoreSDNode *, 8> StoresToReplace;
+
+  for (SDNode &Node : CurDAG->allnodes()) {
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
+      EVT VT = LD->getValueType(0);
+      if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+        continue;
+
+      // To simplify the TableGen patters, we replace all i64 loads with v2i32
+      // loads.  Alternatively, we could promote i64 loads to v2i32 during DAG
+      // legalization, however, so places (ExpandUnalignedLoad) in the DAG
+      // legalizer assume that if i64 is legal, so doing this promotion early
+      // can cause problems.
+      LoadsToReplace.push_back(LD);
+    } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
+      // Handle i64 stores here for the same reason mentioned above for loads.
+      SDValue Value = ST->getValue();
+      if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+        continue;
+      StoresToReplace.push_back(ST);
+    }
+  }
+
+  for (LoadSDNode *LD : LoadsToReplace) {
+    SDLoc SL(LD);
+
+    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
+                                      LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
+                                      MVT::i64, NewLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
+    Modified = true;
+  }
+
+  for (StoreSDNode *ST : StoresToReplace) {
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
+                                       MVT::v2i32, ST->getValue());
+    const SDValue StoreOps[] = {
+      ST->getChain(),
+      NewValue,
+      ST->getBasePtr(),
+      ST->getOffset()
+    };
+
+    CurDAG->UpdateNodeOperands(ST, StoreOps);
+    Modified = true;
+  }
+
+  // XXX - Is this necessary?
+  if (Modified)
+    CurDAG->RemoveDeadNodes();
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
```
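A standalone illustration of the offset-encoding rule documented on isLegalSMRDImmOffset above (the helper function is illustrative, not in the patch): SI/CI encode the SMRD immediate in units of dwords with an 8-bit field, while VI+ encodes it in bytes with a 20-bit field, so the same byte offset encodes and legalizes differently per generation.

```cpp
#include <cstdint>

// Mirrors SelectSMRDOffset's encoding step, illustrative only.
// SI/CI: units of dwords (ByteOffset >> 2), legal if it fits in 8 bits.
// VI+:   units of bytes, legal if it fits in 20 bits.
static int64_t encodeSMRDOffset(bool IsVIPlus, int64_t ByteOffset) {
  return IsVIPlus ? ByteOffset : ByteOffset >> 2;
}

// e.g. ByteOffset = 512 encodes as 128 on SI/CI (fits in 8 bits) and as 512
// on VI (fits in 20 bits). ByteOffset = 2048 encodes as 512 on SI/CI, which
// no longer fits in 8 bits, so CI falls back to its 32-bit immediate form
// and SI materializes the offset with an S_MOV_B32 instead.
```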
+ SmallVector<LoadSDNode *, 8> LoadsToReplace; + SmallVector<StoreSDNode *, 8> StoresToReplace; + + for (SDNode &Node : CurDAG->allnodes()) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { + EVT VT = LD->getValueType(0); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + continue; + + // To simplify the TableGen patters, we replace all i64 loads with v2i32 + // loads. Alternatively, we could promote i64 loads to v2i32 during DAG + // legalization, however, so places (ExpandUnalignedLoad) in the DAG + // legalizer assume that if i64 is legal, so doing this promotion early + // can cause problems. + LoadsToReplace.push_back(LD); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { + // Handle i64 stores here for the same reason mentioned above for loads. + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + continue; + StoresToReplace.push_back(ST); + } + } + + for (LoadSDNode *LD : LoadsToReplace) { + SDLoc SL(LD); + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); + Modified = true; + } + + for (StoreSDNode *ST : StoresToReplace) { + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), + MVT::v2i32, ST->getValue()); + const SDValue StoreOps[] = { + ST->getChain(), + NewValue, + ST->getBasePtr(), + ST->getOffset() + }; + + CurDAG->UpdateNodeOperands(ST, StoreOps); + Modified = true; + } + + // XXX - Is this necessary? + if (Modified) + CurDAG->RemoveDeadNodes(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3a65f3b..222f631 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -27,50 +28,9 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. 
-class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); + // This is totally unsupported, just custom lower to produce an error. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -352,7 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::ADDC, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -429,12 +392,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); setFsqrtIsCheap(true); + // We want to find all load dependencies for long chains of stores to enable + // merging into very wide vectors. The problem is with vectors with > 4 + // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 + // vectors are a legal type, even though we have to split the loads + // usually. When we can more precisely specify load legality per address + // space, we should be able to make FindBetterChain/MergeConsecutiveStores + // smarter so that they can figure out what to do in 2 iterations without all + // N > 4 stores on the same chain. + GatherAllAliasesMaxDepth = 16; + // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; @@ -534,6 +503,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, return true; } +bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { + // There are few operations which truly have vector input operands. Any vector + // operation is going to involve operations on each component, and a + // build_vector will be a copy per element, so it always makes sense to use a + // build_vector input in place of the extracted element to avoid a copy into a + // super register. 
+ // + // We should probably only do this if all users are extracts only, but this + // should be the common case. + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); @@ -617,6 +598,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DAG.getContext()->diagnose(NoDynamicAlloca); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -643,6 +633,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; } @@ -892,7 +883,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } @@ -1043,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1057,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } @@ -1077,6 +1069,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); + // TODO: Should this propagate fast-math-flags? SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, DAG.getConstantFP(1.0f, DL, MVT::f32), Op.getOperand(1)); @@ -1167,45 +1160,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? 
ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1260,7 +1214,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); EVT LoVT, HiVT; EVT LoMemVT, HiMemVT; @@ -1269,23 +1224,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + + unsigned Size = LoMemVT.getStoreSize(); + unsigned BaseAlign = Load->getAlignment(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), BaseAlign); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + DAG.getConstant(Size, SL, PtrVT)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), HiAlign); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1415,7 +1374,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, DAG.getConstant(LoMemVT.getStoreSize(), SL, PtrVT)); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Store->getAlignment(); + unsigned Size = LoMemVT.getStoreSize(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, @@ -1423,15 +1386,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, LoMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + BaseAlign); SDValue HiStore = DAG.getTruncStore(Chain, SL, Hi, HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), + SrcValue.getWithOffset(Size), HiMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + HiAlign); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -1529,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1630,6 +1593,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + // TODO: Should this propagate fast-math-flags? 
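// [Editorial sketch; not part of the patch.] A host-side scalar model of the
// reciprocal-based division that the surrounding comments walk through,
// assuming b != 0, both operands fit in 24 bits, and the reciprocal estimate
// leaves the initial quotient off by at most one:
#include <cstdint>
static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
  // fq = fa * rcp(fb); 24-bit integers are exactly representable in float.
  uint32_t iq = (uint32_t)((float)a * (1.0f / (float)b));
  int64_t rem = (int64_t)a - (int64_t)iq * b;
  if (rem < 0)                { --iq; rem += b; } // estimate was one too high
  else if (rem >= (int64_t)b) { ++iq; rem -= b; } // estimate was one too low
  q = iq; r = (uint32_t)rem;
}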
// float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1940,6 +1904,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); + // TODO: Should this propagate fast-math-flags? + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); @@ -1968,6 +1934,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2045,6 +2012,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + // TODO: Should this propagate fast-math-flags? + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); @@ -2074,6 +2043,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + // TODO: Should this propagate fast-math-flags? + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); @@ -2184,6 +2155,7 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2206,7 +2178,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); - + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); } @@ -2231,6 +2203,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, DAG.getConstant(1, DL, MVT::i32)); SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + // TODO: Should this propagate fast-math-flags? FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); @@ -2257,7 +2230,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::f64); SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, MVT::f64); - + // TODO: Should this propagate fast-math-flags? SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); @@ -2511,12 +2484,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, if (VT == MVT::f32) return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } } break; @@ -2652,20 +2619,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->isExactlyValue(1.0); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isAllOnesValue(); - } - return false; + return isAllOnesConstant(Op); } bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->getValueAPF().isZero(); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isNullValue(); - } - return false; + return isNullConstant(Op); } SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, @@ -2738,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) @@ -2893,8 +2853,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 1; unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) + if (!isNullConstant(Op.getOperand(1))) return SignBits; // TODO: Could probably figure something out with non-0 offsets. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 478b203..7314cc0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -138,6 +138,7 @@ public: bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AS) const override; + bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ -149,6 +150,9 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, @@ -165,14 +169,6 @@ public: SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - SDValue CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -216,7 +212,7 @@ public: /// \brief Helper function that returns the byte offset of the given /// type of implicit parameter. - unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; }; @@ -267,7 +263,6 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. 
MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 15a3d54..a266e71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // TODO: Implement this function - return false; -} bool AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, @@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + unsigned IgnoredFrameReg; + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( + MF, -1, IgnoredFrameReg); return getIndirectIndexBegin(MF) + Offset; } @@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +ArrayRef<std::pair<int, const char *>> +AMDGPUInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 86d3962..53e8b23 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -103,8 +103,6 @@ public: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; @@ -147,6 +145,9 @@ public: return get(pseudoToMCOpcode(Opcode)); } + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + //===---------------------------------------------------------------------===// // Pure virtual functions to be implemented by sub-classes. //===---------------------------------------------------------------------===// @@ -195,6 +196,7 @@ public: }; namespace AMDGPU { + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); } // End namespace AMDGPU diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b413897..70e589c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - // Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when // performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 72cab39..11f6139 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -514,7 +514,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), + (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) >; @@ -522,7 +522,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, class Insert_Element <ValueType elem_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ab489cd..1de3546 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; } // Legacy names for compatibility. 
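The barrier intrinsics above are now marked IntrConvergent. For context (an editorial sketch, not part of the patch): a convergent call must not be sunk into, hoisted out of, or duplicated across divergent control flow, and IR transforms can query the attribute before moving a call. Assuming the attribute-query API of this LLVM revision, a minimal legality check looks like:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Instructions.h"

// Barriers such as llvm.AMDGPU.barrier.local now report the convergent
// attribute, so a sinking or unswitching transform must leave them in place.
static bool mayMoveCall(const llvm::CallInst *CI) {
  return !CI->hasFnAttr(llvm::Attribute::Convergent);
}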
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2083146..dfc652f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( @@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); @@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #endif if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); ++I; } } else { @@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); - CodeStream.flush(); - HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da6..5413717 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,11 +1,10 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" using namespace llvm; -static const char *const ShaderTypeAttribute = "ShaderType"; - // Pin the vtable to this file. 
void AMDGPUMachineFunction::anchor() {} @@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } + ShaderType = AMDGPU::getShaderType(*MF.getFunction()); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index f5e4694..46fcee8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -37,6 +37,11 @@ public: return ShaderType; } + bool isKernel() const { + // FIXME: Assume everything is a kernel until function calls are supported. + return true; + } + unsigned ScratchSize; bool IsKernel; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp new file mode 100644 index 0000000..554bf1d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -0,0 +1,373 @@ +//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass resolves calls to OpenCL image attribute, image resource ID and +/// sampler resource ID getter functions. +/// +/// Image attributes (size and format) are expected to be passed to the kernel +/// as kernel arguments immediately following the image argument itself, +/// therefore this pass adds image size and format arguments to the kernel +/// functions in the module. The kernel functions with image arguments are +/// re-created using the new signature. The new arguments are added to the +/// kernel metadata with kernel_arg_type set to "image_size" or "image_format". +/// Note: this pass may invalidate pointers to functions. +/// +/// Resource IDs of read-only images, write-only images and samplers are +/// defined to be their index among the kernel arguments of the same +/// type and access qualifier. 
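/// [Editorial illustration; not part of the patch.] Concretely, under the
/// conventions above a kernel declared as
///   kernel void k(read_only image2d_t img, sampler_t smp);
/// is re-created by this pass with two extra arguments immediately after each
/// image argument, roughly
///   kernel void k(read_only image2d_t img, __size_img, __format_img, smp);
/// where __size_img is a [3 x i32] and __format_img a [2 x i32] value, and
/// calls to llvm.OpenCL.image.get.resource.id(img) and
/// llvm.OpenCL.sampler.get.resource.id(smp) fold to the constant 0, since
/// each is the first argument of its type and access qualifier.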
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; + +StringRef ImageSizeArgMDType = "__llvm_image_size"; +StringRef ImageFormatArgMDType = "__llvm_image_format"; + +StringRef KernelsMDNodeName = "opencl.kernels"; +StringRef KernelArgMDNodeNames[] = { + "kernel_arg_addr_space", + "kernel_arg_access_qual", + "kernel_arg_type", + "kernel_arg_base_type", + "kernel_arg_type_qual"}; +const unsigned NumKernelArgMDNodes = 5; + +typedef SmallVector<Metadata *, 8> MDVector; +struct KernelArgMD { + MDVector ArgVector[NumKernelArgMDNodes]; +}; + +} // end anonymous namespace + +static inline bool +IsImageType(StringRef TypeString) { + return TypeString == "image2d_t" || TypeString == "image3d_t"; +} + +static inline bool +IsSamplerType(StringRef TypeString) { + return TypeString == "sampler_t"; +} + +static Function * +GetFunctionFromMDNode(MDNode *Node) { + if (!Node) + return nullptr; + + size_t NumOps = Node->getNumOperands(); + if (NumOps != NumKernelArgMDNodes + 1) + return nullptr; + + auto F = mdconst::dyn_extract<Function>(Node->getOperand(0)); + if (!F) + return nullptr; + + // Sanity checks. + size_t ExpectNumArgNodeOps = F->arg_size() + 1; + for (size_t i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1)); + if (ArgNode->getNumOperands() != ExpectNumArgNodeOps) + return nullptr; + if (!ArgNode->getOperand(0)) + return nullptr; + + // FIXME: It should be possible to do image lowering when some metadata + // args are missing or not in the expected order.
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0)); + if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i]) + return nullptr; + } + + return F; +} + +static StringRef +AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2)); + return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString(); +} + +static StringRef +ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3)); + return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString(); +} + +static MDVector +GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) { + MDVector Res; + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1)); + Res.push_back(Node->getOperand(OpIdx)); + } + return Res; +} + +static void +PushArgMD(KernelArgMD &MD, const MDVector &V) { + assert(V.size() == NumKernelArgMDNodes); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MD.ArgVector[i].push_back(V[i]); + } +} + +namespace { + +class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { + static char ID; + + LLVMContext *Context; + Type *Int32Type; + Type *ImageSizeType; + Type *ImageFormatType; + SmallVector<Instruction *, 4> InstsToErase; + + bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID, + Argument &ImageSizeArg, + Argument &ImageFormatArg) { + bool Modified = false; + + for (auto &Use : ImageArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name.startswith(GetImageResourceIDFunc)) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else if (Name.startswith(GetImageSizeFunc)) { + Replacement = &ImageSizeArg; + } else if (Name.startswith(GetImageFormatFunc)) { + Replacement = &ImageFormatArg; + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { + bool Modified = false; + + for (const auto &Use : SamplerArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name == GetSamplerResourceIDFunc) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { + uint32_t NumReadOnlyImageArgs = 0; + uint32_t NumWriteOnlyImageArgs = 0; + uint32_t NumSamplerArgs = 0; + + bool Modified = false; + InstsToErase.clear(); + for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) { + Argument &Arg = *ArgI; + StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo()); + + // Handle image types. 
+ if (IsImageType(Type)) { + StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo()); + uint32_t ResourceID; + if (AccessQual == "read_only") { + ResourceID = NumReadOnlyImageArgs++; + } else if (AccessQual == "write_only") { + ResourceID = NumWriteOnlyImageArgs++; + } else { + llvm_unreachable("Wrong image access qualifier."); + } + + Argument &SizeArg = *(++ArgI); + Argument &FormatArg = *(++ArgI); + Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg); + + // Handle sampler type. + } else if (IsSamplerType(Type)) { + uint32_t ResourceID = NumSamplerArgs++; + Modified |= replaceSamplerUses(Arg, ResourceID); + } + } + for (unsigned i = 0; i < InstsToErase.size(); ++i) { + InstsToErase[i]->eraseFromParent(); + } + + return Modified; + } + + std::tuple<Function *, MDNode *> + addImplicitArgs(Function *F, MDNode *KernelMDNode) { + bool Modified = false; + + FunctionType *FT = F->getFunctionType(); + SmallVector<Type *, 8> ArgTypes; + + // Metadata operands for new MDNode. + KernelArgMD NewArgMDs; + PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); + + // Add implicit arguments to the signature. + for (unsigned i = 0; i < FT->getNumParams(); ++i) { + ArgTypes.push_back(FT->getParamType(i)); + MDVector ArgMD = GetArgMD(KernelMDNode, i + 1); + PushArgMD(NewArgMDs, ArgMD); + + if (!IsImageType(ArgTypeFromMD(KernelMDNode, i))) + continue; + + // Add size implicit argument. + ArgTypes.push_back(ImageSizeType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + // Add format implicit argument. + ArgTypes.push_back(ImageFormatType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + Modified = true; + } + if (!Modified) { + return std::make_tuple(nullptr, nullptr); + } + + // Create function with new signature and clone the old body into it. + auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false); + auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg: F->args()) { + auto ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) { + (NewFArgIt++)->setName(Twine("__size_") + ArgName); + (NewFArgIt++)->setName(Twine("__format_") + ArgName); + } + } + SmallVector<ReturnInst*, 8> Returns; + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + + // Build new MDNode. + SmallVector<llvm::Metadata *, 6> KernelMDArgs; + KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) + KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); + + return std::make_tuple(NewF, NewMDNode); + } + + bool transformKernels(Module &M) { + NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName); + if (!KernelsMDNode) + return false; + + bool Modified = false; + for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) { + MDNode *KernelMDNode = KernelsMDNode->getOperand(i); + Function *F = GetFunctionFromMDNode(KernelMDNode); + if (!F) + continue; + + Function *NewF; + MDNode *NewMDNode; + std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode); + if (NewF) { + // Replace old function and metadata with new ones. 
+ F->eraseFromParent(); + M.getFunctionList().push_back(NewF); + M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(), + NewF->getAttributes()); + KernelsMDNode->setOperand(i, NewMDNode); + + F = NewF; + KernelMDNode = NewMDNode; + Modified = true; + } + + Modified |= replaceImageAndSamplerUses(F, KernelMDNode); + } + + return Modified; + } + + public: + AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + Context = &M.getContext(); + Int32Type = Type::getInt32Ty(M.getContext()); + ImageSizeType = ArrayType::get(Int32Type, 3); + ImageFormatType = ArrayType::get(Int32Type, 2); + + return transformKernels(M); + } + + const char *getPassName() const override { + return "AMDGPU OpenCL Image Type Pass"; + } +}; + +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + +} // end anonymous namespace + +ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { + return new AMDGPUOpenCLImageTypeLoweringPass(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 57b7a73..87d50d5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - const FunctionType *FTy = F.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); LocalMemAvailable = ST.getLocalMemorySize(); @@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); + Type *ParamTy = FTy->getParamType(i); if (ParamTy->isPointerTy() && ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemAvailable = 0; @@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Check how much local memory is being used by global objects for (Module::global_iterator I = Mod->global_begin(), E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; + GlobalVariable *GV = &*I; PointerType *GVTy = GV->getType(); if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { return false; } -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { +static VectorType *arrayTypeToVecType(Type *ArrayTy) { return VectorType::get(ArrayTy->getArrayElementType(), ArrayTy->getArrayNumElements()); } @@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + if (!I.isStaticAlloca()) + return; + IRBuilder<> Builder(&I); // First try to replace the alloca with a vector diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index cfd800b..0344834 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { assert(!"Unimplemented"); return BitVector(); } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - virtual unsigned getHWRegIndex(unsigned Reg) const { assert(!"Unimplemented"); return 0; } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 5f32a65..44e0c47 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -16,6 +16,7 @@ #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" +#include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // disable it. SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. + FullFS += "+flat-for-global,"; FullFS += FS; if (GPU == "" && TT.getArch() == Triple::amdgcn) @@ -67,26 +70,36 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), + FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { initializeSubtargetDependencies(TT, GPU, FS); + const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(TM, *this)); + + // FIXME: Should have R600 specific FrameLowering + FrameLowering.reset(new AMDGPUFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } else { InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(TM, *this)); + FrameLowering.reset(new SIFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 735f01d..9c7bb88 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,4 +1,4 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H + #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUISelLowering.h" #include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include 
"AMDKernelCodeT.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -72,6 +70,7 @@ private: bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; + bool FlatForGlobal; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; @@ -88,10 +87,10 @@ private: bool CIInsts; bool FeatureDisable; int LDSBankCount; - unsigned IsaVersion; + unsigned IsaVersion; bool EnableHugeScratchBuffer; - AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUFrameLowering> FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; @@ -104,7 +103,7 @@ public: StringRef GPU, StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return FrameLowering.get(); } const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); @@ -161,6 +160,10 @@ public: return FlatAddressSpace; } + bool useFlatForGlobal() const { + return FlatForGlobal; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -305,6 +308,9 @@ public: return isAmdHsaOS() ? 0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2297b52..22f85b3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + + PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeSILowerI1CopiesPass(*PR); + initializeSIFixSGPRCopiesPass(*PR); + initializeSIFoldOperandsPass(*PR); + initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.getOS() == Triple::AMDHSA) + return make_unique<AMDGPUHSATargetObjectFile>(); + + return make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -72,15 +90,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} +AMDGPUTargetMachine::~AMDGPUTargetMachine() { } //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) @@ -110,7 +126,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) 
- : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + + // Exceptions and StackMaps are not supported, so these passes will never do + // anything. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { return getTM<AMDGPUTargetMachine>(); @@ -126,8 +148,9 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addGCPasses() override; }; class R600PassConfig : public AMDGPUPassConfig { @@ -147,6 +170,8 @@ public: : AMDGPUPassConfig(TM, PM) { } bool addPreISel() override; bool addInstSelector() override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -156,7 +181,7 @@ public: } // End of anonymous namespace TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo( AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); }); @@ -172,6 +197,10 @@ void AMDGPUPassConfig::addIRPasses() { // functions, then we will generate code for the first function // without ever running any passes on the second. addPass(createBarrierNoopPass()); + + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. + addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + TargetPassConfig::addIRPasses(); } @@ -198,6 +227,11 @@ bool AMDGPUPassConfig::addInstSelector() { return false; } +bool AMDGPUPassConfig::addGCPasses() { + // Do nothing. GC is not supported. + return false; +} + //===----------------------------------------------------------------------===// // R600 Pass Setup //===----------------------------------------------------------------------===// @@ -238,16 +272,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + + // FIXME: We need to run a pass to propagate the attributes when calls are + // supported. + addPass(&AMDGPUAnnotateKernelFeaturesID); + addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + return false; } bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(&SIFixSGPRCopiesID); addPass(createSIFoldOperandsPass()); return false; } @@ -259,7 +300,6 @@ void GCNPassConfig::addPreRegAlloc() { // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } @@ -269,16 +309,27 @@ void GCNPassConfig::addPreRegAlloc() { // This should be run after scheduling, but before register allocation. It // also needs extra copies to the address operand to be eliminated.
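// [Editorial note; not part of the patch.] TargetPassConfig::insertPass(A, B)
// schedules pass B to run right after pass A, which is how the retained code
// below pins these machine passes against the scheduler:
//
//   insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); // merge LD/ST
//   insertPass(&MachineSchedulerID, &RegisterCoalescerID);    // clean up the
//                                                             // extra copies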
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + addPass(&SIFixSGPRLiveRangesID); + TargetPassConfig::addFastRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // We want to run this after LiveVariables is computed to avoid computing them + // twice. + // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure + // that needs to be fixed. + insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 14792e3..236e3f8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { private: protected: - TargetLoweringObjectFile *TLOF; + std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; @@ -52,7 +52,7 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; + return TLOF.get(); } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp new file mode 100644 index 0000000..e050f21 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + +void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM){ + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); + + TextSection = AMDGPU::getHSATextSection(Ctx); + + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast<MCSectionELF>(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. + return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); +} + +MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isText() && !GV->hasComdat()) + return getTextSection(); + + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h new file mode 100644 index 0000000..921341e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -0,0 +1,51 @@ +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the AMDGPU-specific subclass of +/// TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc74..54a003d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { + return Vector ? 0 : 32; +} unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, + const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::not_intrinsic: + // This means we have an intrinsic that isn't defined in + // IntrinsicsAMDGPU.td + break; + + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + return true; + } + + StringRef Name = I->getCalledFunction()->getName(); + switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { + default: + return false; + case AMDGPUIntrinsic::SI_tid: + case AMDGPUIntrinsic::SI_fs_interp: + return true; + } +} + +static bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + unsigned ShaderType = AMDGPU::getShaderType(*F); + + // Arguments to compute shaders are never a source of divergence. 
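// [Editorial note; not part of the patch.] Intuition for the SGPR/VGPR split
// used here and by isSourceOfDivergence() below: a value is uniform when every
// lane of a wavefront computes the same result. For example:
//
//   __kernel void f(__global int *p, int n) { // p, n: SGPRs, uniform
//     int i = get_global_id(0);               // per-lane id: divergent
//     p[i] = n;                               // divergent address, uniform data
//   }
//
// Kernel arguments are broadcast through scalar registers, while workitem-id
// intrinsics and private-memory loads can differ per lane.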
+ if (ShaderType == ShaderType::COMPUTE) + return true; + + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || + F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + return true; + + // Everything else is in VGPRs. + return false; +} + +/// +/// \returns true if the result of the value could potentially be +/// different across workitems in a wavefront. +bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { + + if (const Argument *A = dyn_cast<Argument>(V)) + return !isArgPassedInSGPR(A); + + // Loads from the private address space are divergent, because threads + // can execute the load instruction with the same inputs and get different + // results. + // + // All other loads are not divergent, because if threads issue loads with the + // same arguments, they will always get the same result. + if (const LoadInst *Load = dyn_cast<LoadInst>(V)) + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + const TargetMachine &TM = getTLI()->getTargetMachine(); + return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); + } + + // Assume all function calls are a source of divergence. + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69..976afb0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,11 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + unsigned getCFInstrCost(unsigned Opcode); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + bool isSourceOfDivergence(const Value *V) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index d918ac3..917efd1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -185,7 +185,7 @@ protected: MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; + const R600RegisterInfo *TRI; // PRINT FUNCTIONS /// Print the ordered Blocks. @@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() { } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = - GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; DEBUG( @@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() { } while (!Finish && MakeProgress); // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. 
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); @@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); + (*It)->removeSuccessor(LoopHeader, true); } numLoopcontPatternMatch += NumCont; @@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // If MigrateTrue is true, then TrueBB is the block being "branched into" // and if MigrateFalse is true, then FalseBB is the block being // "branched into" - // + // // Here is the pseudo code for how I think the optimization should work: // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. @@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // the late machine optimization passes, however if we implement // bool TargetRegisterInfo::requiresRegisterScavenging( // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly + // and have it return true, liveness will be tracked correctly // by generic optimization passes. We will also need to make sure that // all of our target-specific passes that run after regalloc and before // the CFGStructurizer track liveness and we will need to modify this pass @@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, ); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - DstMBB->removeSuccessor(SrcMBB); + DstMBB->removeSuccessor(SrcMBB, true); cloneSuccessorList(DstMBB, SrcMBB); removeSuccessor(SrcMBB); @@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (TrueMBB) { MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); + MBB->removeSuccessor(TrueMBB, true); if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); + TrueMBB->removeSuccessor(LandMBB, true); retireBlock(TrueMBB); MLI->removeBlock(TrueMBB); } @@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, insertInstrBefore(I, AMDGPU::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); + MBB->removeSuccessor(FalseMBB, true); if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); + FalseMBB->removeSuccessor(LandMBB, true); retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, // now branchInst can be erased safely BranchMI->eraseFromParent(); // now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); + ExitingMBB->removeSuccessor(LandMBB, true); } void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); -
PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); @@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, ); SpliceEnd = SrcMBB->end(); } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); + DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } DEBUG( @@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, DEBUG( dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; + << "srcSize = " << SrcMBB->size() << '\n'; ); } @@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); + MBB->removeSuccessor(MBB1, true); } void AMDGPUCFGStructurizer::addDummyExitBlock( diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2018983..d9f753f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -28,7 +28,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -83,6 +85,7 @@ public: unsigned RegNo; int Modifiers; const MCRegisterInfo *TRI; + const MCSubtargetInfo *STI; bool IsForcedVOP3; }; @@ -102,7 +105,7 @@ public: } void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } void addRegOrImmOperands(MCInst &Inst, unsigned N) const { @@ -215,6 +218,10 @@ public: (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); } + bool isSCSrc64() const { + return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + } + bool isVCSrc32() const { return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); } @@ -251,7 +258,22 @@ public: return EndLoc; } - void print(raw_ostream &OS) const override { } + void print(raw_ostream &OS) const override { + switch (Kind) { + case Register: + OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + break; + case Immediate: + OS << getImm(); + break; + case Token: + OS << '\'' << getToken() << '\''; + break; + case Expression: + OS << "<expr " << *Expr << '>'; + break; + } + } static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc 
Loc, enum ImmTy Type = ImmTyNone, @@ -278,10 +300,12 @@ public: static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, SMLoc E, const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, bool ForceVOP3) { auto Op = llvm::make_unique<AMDGPUOperand>(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; + Op->Reg.STI = STI; Op->Reg.Modifiers = -1; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; @@ -301,14 +325,32 @@ public: bool isDSOffset01() const; bool isSWaitCnt() const; bool isMubufOffset() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; }; class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); + } + /// @name Auto-generated Match Functions /// { @@ -323,20 +365,34 @@ private: bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); + bool ParseSectionDirectiveHSAText(); + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + bool ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, +public: + enum AMDGPUMatchResultTy { + Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY + }; + + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0) { + MCAsmParserExtension::Initialize(Parser); - if (STI.getFeatureBits().none()) { + if (getSTI().getFeatureBits().none()) { // Set default features. 
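// (Editor's note, with an assumed invocation for illustration:) this path is
// taken when no CPU was requested, e.g. a bare `llvm-mc -triple amdgcn--` run
// leaves the feature bits empty, and the toggle below selects
// SOUTHERN_ISLANDS so the generated match tables are computed against a
// concrete ISA.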
- STI.ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } AMDGPUTargetStreamer &getTargetStreamer() { @@ -420,10 +476,10 @@ struct OptionalOperand { } -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { +static int getRegClass(bool IsVgpr, unsigned RegWidth) { if (IsVgpr) { switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; @@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; case 4: return AMDGPU::SReg_128RegClassID; @@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } } -static unsigned getRegForName(const StringRef &RegName) { +static unsigned getRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End RegIndexInClass = RegLo; } else { // SGPR registers are aligned. Max alignment is 4 dwords. 
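// (Editor's worked example:) for `s[4:7]`, RegWidth = 4, so
// Size = min(4, 4) = 4 and RegLo = 4 is a multiple of Size, giving
// RegIndexInClass = 4 / 4 = 1 in SGPR_128. A misaligned range such as
// `s[3:6]` now fails the new `RegLo % Size` check below, where the old
// integer division would have silently truncated it to s[0:3].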
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); + unsigned Size = std::min(RegWidth, 4u); + if (RegLo % Size != 0) + return true; + + RegIndexInClass = RegLo / Size; } } - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + int RCID = getRegClass(IsVgpr, RegWidth); + if (RCID == -1) return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; + + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIndexInClass >= RC.getNumRegs()) + return true; + + RegNo = RC.getRegister(RegIndexInClass); + return !subtargetHasRegister(*TRI, RegNo); } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) return Match_InvalidOperand; + if ((TSFlags & SIInstrFlags::VOP3) && + (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) && + getForcedEncodingSize() != 64) + return Match_PreferE32; + return Match_Success; } @@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, default: break; case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction not supported on this GPU"); @@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } return Error(ErrorLoc, "invalid operand for instruction"); } + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); } llvm_unreachable("Implement any new match types added!"); } @@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
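// (Editor's sketch:) e.g. when assembling for fiji (ISA 8.0.1 per the
// Processors.td hunk later in this change), a bare `.hsa_code_object_isa`
// directive expands to the equivalent of:
//   .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"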
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, Isa.Stepping, "AMD", "AMDGPU"); @@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); while (true) { @@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { return false; } +bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSATextSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef KernelName = Parser.getTok().getString(); + + getTargetStreamer().EmitAMDGPUSymbolType(KernelName, + ELF::STT_AMDGPU_HSA_KERNEL); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".hsatext" || IDVal == ".text") + return ParseSectionDirectiveHSAText(); + + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); + + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + + return true; +} + +bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, + unsigned RegNo) const { + if (isCI()) + return true; + + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + 
case AMDGPU::FLAT_SCR_HI:
+      return false;
+    default:
+      return true;
+    }
+  }
+
+  // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
+  // SI/CI have.
+  for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
+       R.isValid(); ++R) {
+    if (*R == RegNo)
+      return false;
+  }
+
   return true;
 }

@@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
       int64_t IntVal;
       if (getParser().parseAbsoluteExpression(IntVal))
         return MatchOperand_ParseFail;
-      APInt IntVal32(32, IntVal);
-      if (IntVal32.getSExtValue() != IntVal) {
+      if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
         Error(S, "invalid immediate: only 32-bit values are legal");
         return MatchOperand_ParseFail;
       }

-      IntVal = IntVal32.getSExtValue();
       if (Negate)
         IntVal *= -1;
       Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
@@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {

       Operands.push_back(AMDGPUOperand::CreateReg(
-          RegNo, S, E, getContext().getRegisterInfo(),
+          RegNo, S, E, getContext().getRegisterInfo(), &getSTI(),
           isForcedVOP3()));

       if (HasModifiers || Modifiers) {
@@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) {
 }

 //===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSMRDOffset() const {
+
+  // FIXME: Support 20-bit offsets on VI. We need to pass subtarget
+  // information here.
+  return isImm() && isUInt<8>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+  // 32-bit literals are only supported on CI and we only want to use them
+  // when the offset is > 8 bits.
+  return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
+
+//===----------------------------------------------------------------------===//
 // vop3
 //===----------------------------------------------------------------------===//

@@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
 }

 void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
-  ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
-  unsigned i = 2;
+
+  unsigned i = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  if (Desc.getNumDefs() > 0) {
+    ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+  }

   std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;

diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
index 2f5fdbe..88a090d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -8,6 +8,22 @@
 //===----------------------------------------------------------------------===//
 // Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===// +// Remaining instructions: +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 def isCIVI : Predicate < @@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; let SubtargetPredicate = isCIVI in { +let SchedRW = [WriteDoubleAdd] in { defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", VOP_F64_F64, ftrunc >; @@ -35,82 +52,218 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", VOP_F64_F64, frint >; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", VOP_F32_F32 >; defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", VOP_F32_F32 >; +} // End SchedRW = [WriteQuarterRate32] + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; + +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, + "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, + "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol +>; //===----------------------------------------------------------------------===// // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def 
FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 +defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < + flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 +>; +defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < + flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 +>; +defm FLAT_LOAD_USHORT : FLAT_Load_Helper < + flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 +>; +defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < + flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> +; +defm FLAT_LOAD_DWORD : FLAT_Load_Helper < + flat<0xc, 0x14>, "flat_load_dword", VGPR_32 +>; +defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < + flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 +>; +defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < + flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 +>; +defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < + flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 +>; +defm FLAT_STORE_BYTE : FLAT_Store_Helper < + flat<0x18>, "flat_store_byte", VGPR_32 +>; +defm FLAT_STORE_SHORT : FLAT_Store_Helper < + flat <0x1a>, "flat_store_short", VGPR_32 +>; +defm FLAT_STORE_DWORD : FLAT_Store_Helper < + flat<0x1c>, "flat_store_dword", VGPR_32 +>; +defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + flat<0x1d>, "flat_store_dwordx2", VReg_64 +>; +defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 >; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 +defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 >; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 >; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 +>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 +>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 +>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 +>; +defm 
FLAT_ATOMIC_SMAX : FLAT_ATOMIC < + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 +>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 +>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC < + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 +>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC < + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 +>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 +>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC < + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 +>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 +>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 +>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 +>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 +>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 +>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 +>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 +>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; } // End SubtargetPredicate = isCIVI +// CI Only flat instructions + +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 
flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < + flat<0x3f>, "flat_atomic_fmin", VGPR_32 +>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < + flat<0x40>, "flat_atomic_fmax", VGPR_32 +>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 +>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < + flat<0x60>, "flat_atomic_fmax_x2", VReg_64 +>; + +} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -147,3 +300,80 @@ def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; } // End HasFlatAddressSpace predicate +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + + +//===----------------------------------------------------------------------===// +// Patterns to generate flat for global +//===----------------------------------------------------------------------===// + +def useFlatForGlobal : Predicate < + "Subtarget->useFlatForGlobal() || " + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">; + +let Predicates = [useFlatForGlobal] in { + +// 1. 
Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +} // End Predicates = [useFlatForGlobal] diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td index ba4df82..a6c3785 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; +def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { + let eop = 0; // This bit is not used on Cayman. 
+} + class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 7adcd46..779a14e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, "MEM_RAT "#name, pattern>; +class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> + : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), + "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" + #!if(has_eop, ", $eop", ""), + [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, + R600_Reg128:$index_gpr, + (i32 imm:$rat_id))]>; + def RAT_MSKOR : CF_MEM_RAT <0x11, 0, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), "MSKOR $rw_gpr.XW, $index_gpr", @@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; +def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; + } // End usesCustomInserter = 1 class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index e811d5c..a187de8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == DoubleToBits(-4.0)) O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); + else { + assert(isUInt<32>(Imm)); + + // In rare situations, we will have a 32-bit literal in a 64-bit + // operand. This is technically allowed for the encoding of s_mov_b64. 
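// (Editor's illustration, not in the commit:) e.g. `s_mov_b64 s[0:1], 0x1234beef`
// carries a single 32-bit literal in the encoding of the 64-bit operand,
// which is why the assert above only requires isUInt<32> rather than
// rejecting the value outright.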
+ O << formatHex(static_cast<uint64_t>(Imm)); + } } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } else { unsigned Stream = (SImm16 >> 8) & 0x3; if (Op == 1) - O << "cut"; + O << "cut"; else if (Op == 2) - O << "emit"; + O << "emit"; else if (Op == 3) - O << "emit-cut"; + O << "emit-cut"; O << " stream " << Stream; } O << "), [m0] "; diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 14fb511..90541d8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -13,9 +13,7 @@ #ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 4434d9b..60e8c8f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. + // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. *Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp new file mode 100644 index 0000000..9ff9fe7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -0,0 +1,26 @@ +//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//

+#include "AMDGPUELFStreamer.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+void AMDGPUELFStreamer::InitSections(bool NoExecStack) {
+  // Start with the .hsatext section by default.
+  SwitchSection(AMDGPU::getHSATextSection(getContext()));
+}
+
+MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
+                                             MCAsmBackend &MAB,
+                                             raw_pwrite_stream &OS,
+                                             MCCodeEmitter *Emitter,
+                                             bool RelaxAll) {
+  return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
new file mode 100644
index 0000000..488d7e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -0,0 +1,40 @@
+//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+  AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                    MCCodeEmitter *Emitter)
+    : MCELFStreamer(Context, MAB, OS, Emitter) { }
+
+  virtual void InitSections(bool NoExecStack) override;
+};
+
+MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                       raw_pwrite_stream &OS,
+                                       MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+ +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 01021d6..59a9178 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ enum Fixups { /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 028a86d..68b1d1a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -41,3 +34,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; } + +bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || + MCAsmInfo::shouldOmitSectionDirective(SectionName); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a5bac51..a546961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -21,12 +21,13 @@ class Triple; // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, // you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value +// a prefix that won't appear in a function name. The default value // for PrivateGlobalPrefix is 'L', so it will consider any function starting // with 'L' as a local symbol. 
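// (Editor's example, hypothetical name:) with the default 'L' prefix, a
// function named `LerpKernel` would be mistaken for an assembler-local
// temporary label and omitted from the symbol table, hence the warning
// above for anyone subclassing this MCAsmInfo.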
class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); + bool shouldOmitSectionDirective(StringRef SectionName) const override; }; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c709741..f704094 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" +#include "AMDGPUELFStreamer.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" @@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer( return new AMDGPUTargetELFStreamer(S); } +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + if (T.getOS() == Triple::AMDHSA) + return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + + return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); @@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } // R600 specific registration diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 09e6cb1..b91134d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" @@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { } +void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + switch (Type) { + default: llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; + break; + } +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection()); + // The MCObjectFileInfo that is available to the assembler is a generic + // implementation and not AMDGPUHSATargetObjectFile, so we 
can't use + // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. + OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(SymbolName)); + Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index d37677c..83bb728 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H + #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -27,6 +30,12 @@ public: StringRef ArchName) = 0; virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; + + virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -41,6 +50,12 @@ public: StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -72,6 +87,12 @@ public: void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index e683498..3c1142d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; public: - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) : MCII(mcii), MRI(mri) { } @@ -50,8 +49,8 @@ public: uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; -private: 
+private: void EmitByte(unsigned int byte, raw_ostream &OS) const; void Emit(uint32_t value, raw_ostream &OS) const; @@ -59,7 +58,6 @@ private: unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - }; } // End anonymous namespace @@ -83,7 +81,7 @@ enum FCInstr { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - MCContext &Ctx) { + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 65a0eeb..9eb3dad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() override {} @@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isExpr()) { const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. 
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index d9a0723..a1584a2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel, def : ProcessorModel<"fiji", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index c8f37f6..bd80bb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -405,8 +405,8 @@ private: if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); + getLiteral(&*BI, Literals); + ClauseContent.push_back(&*BI); } I = BI; DeleteMI->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 4e4d554..124a9c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::Source); } +static inline bool isEOP(MachineBasicBlock::iterator I) { + return std::next(I)->getOpcode() == AMDGPU::RETURN; +} + MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineFunction * MF = BB->getParent(); @@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit + .addImm(isEOP(I)); // Set End of program bit + break; + } + case AMDGPU::RAT_STORE_TYPED_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } @@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; @@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); SDLoc DL(Op); + + // TODO: Should this propagate fast-math-flags? 
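// (Editor's note:) the DAG built below performs range reduction before the
// hardware trig op: the argument is scaled (by 1/(2*pi) in the full source)
// and AMDGPUISD::FRACT folds it into a single period, since the R600 SIN/COS
// units only accept a bounded input. Whether that new node should inherit
// fast-math flags is exactly the open TODO above.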
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FADD, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, @@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, Arg->getOperand(0).getOperand(Element)); } } + break; } case ISD::SELECT_CC: { diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 855fa9f..8b6eea1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -922,7 +922,7 @@ bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ + BranchProbability Probability) const{ return true; } @@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return true; } bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) + BranchProbability Probability) const { return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index dee4c2b..e7251c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -174,18 +174,18 @@ namespace llvm { bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; + BranchProbability Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index 7beed09..33ef6a4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; // ISel Patterns //===----------------------------------------------------------------------===// -// CND*_INT Pattterns for f32 True / False values +// CND*_INT Patterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 0c06ccc..5efb3b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; + MachineBasicBlock *MB = &*MBB; PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index deee5bc..2126961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -81,11 +81,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(BI); + int BISlot = getSlot(&*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(BI)) + if (TII->isPredicated(&*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +95,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { + if (isTrans || TII->isTransOnly(&*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -149,7 +149,7 @@ private: public: // Ctor. R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), + : VLIWPacketizerList(MF, MLI, nullptr), TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { @@ -162,14 +162,14 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { + bool isSoloInstruction(const MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); @@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { continue; } - Packetizer.PacketizeMIs(MBB, I, RegionEnd); + Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); RegionEnd = I; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9713e60..4f8a129 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the register class of the specified type to use in the /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index ccfbf1b..fa4d24a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) Preds.push_back(*PI); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); + CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 4c32639..7f79dd3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -37,7 +37,8 @@ enum { MIMG = 1 << 18, FLAT = 1 << 19, WQM = 1 << 20, - VGPRSpill = 1 << 21 + VGPRSpill = 1 << 21, + VOPAsmPrefer32Bit = 1 << 22 }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp index 5fe8d19..636750d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -16,15 +16,9 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 23502b4..96e37c5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -82,22 +82,10 @@ using namespace llvm; namespace { class SIFixSGPRCopies : public MachineFunctionPass { - -private: +public: static char ID; - const TargetRegisterClass 
*inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + SIFixSGPRCopies() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -105,14 +93,23 @@ public: return "SI Fix SGPR copies"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace +INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); +char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; + +FunctionPass *llvm::createSIFixSGPRCopiesPass() { + return new SIFixSGPRCopies(); } static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { @@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { return false; } -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getCopyRegClasses(const MachineInstr &Copy, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + + const TargetRegisterClass *SrcRC = + TargetRegisterInfo::isVirtualRegister(SrcReg) ? + MRI.getRegClass(SrcReg) : + TRI.getPhysRegClass(SrcReg); - return RC; + // We don't really care about the subregister here. + // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); + + const TargetRegisterClass *DstRC = + TargetRegisterInfo::isVirtualRegister(DstReg) ? 
+ MRI.getRegClass(DstReg) : + TRI.getPhysRegClass(DstReg); + + return std::make_pair(SrcRC, DstRC); } -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } +static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); +} - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); +static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { +// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. +// +// SGPRx = ... +// SGPRy = REG_SEQUENCE SGPRx, sub0 ... +// VGPRz = COPY SGPRy +// +// ==> +// +// VGPRx = COPY SGPRx +// VGPRz = REG_SEQUENCE VGPRx, sub0 +// +// This exposes immediate folding opportunities when materializing 64-bit +// immediates. +static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII, + MachineRegisterInfo &MRI) { + assert(MI.isRegSequence()); + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) + return false; - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + if (!MRI.hasOneUse(DstReg)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. + MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); + if (!CopyUse.isCopy()) return false; - } - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); - const TargetRegisterClass *SrcRC; + if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + // TODO: Could have multiple extracts? + unsigned SubReg = CopyUse.getOperand(1).getSubReg(); + if (SubReg != AMDGPU::NoSubRegister) return false; - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); + MRI.setRegClass(DstReg, DstRC); + + // SGPRx = ... + // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
+ // VGPRz = COPY SGPRy + + // => + // VGPRx = COPY SGPRx + // VGPRz = REG_SEQUENCE VGPRx, sub0 + + MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + unsigned SrcReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + assert(TRI->isSGPRClass(SrcRC) && + "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); + + SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); + + unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) + .addOperand(MI.getOperand(I)); + + MI.getOperand(I).setReg(TmpReg); + } + + CopyUse.eraseFromParent(); + return true; } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { @@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + SmallVector<MachineInstr *, 16> Worklist; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + I != E; ++I) { MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + default: + continue; + case AMDGPU::COPY: { + // If the destination register is a physical register there isn't really + // much we can do to fix this.
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) + continue; - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + TII->moveToVALU(MI); } + break; + } + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; @@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } case AMDGPU::REG_SEQUENCE: { if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) + !hasVGPROperands(MI, TRI)) { + foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; + } DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp index 0c54446..8bda283 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. +/// \file SALU instructions ignore the execution mask, so we need to modify the +/// live ranges of the registers they define in some cases. /// /// The main case we need to handle is when a def is used in one side of a /// branch and not another. For example: @@ -42,13 +41,15 @@ /// ENDIF /// %use /// -/// Adding this use will make the def live thoughout the IF branch, which is +/// Adding this use will make the def live throughout the IF branch, which is /// what we want. 
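Editorial aside: a condensed sketch of the strategy the rewritten SIFixSGPRLiveRanges pass below implements, not a verbatim excerpt. SuccA, SuccB, and NCD name the two successors of a structured branch and their nearest common post-dominator, as in the code further down; the API calls (LiveVariables::isLiveIn, HandleVirtRegUse, BuildMI) are the same ones the diff uses.

static void pinSGPRAcrossBranch(unsigned Reg, LiveVariables &LV,
                                const SIInstrInfo &TII,
                                MachineBasicBlock &SuccA,
                                MachineBasicBlock &SuccB,
                                MachineBasicBlock &NCD) {
  // Live into both successors or into neither: the range already agrees
  // on both sides of the branch, so nothing needs fixing.
  if (LV.isLiveIn(Reg, SuccA) == LV.isLiveIn(Reg, SuccB))
    return;
  // Live into exactly one side: pin the SGPR with an artificial implicit
  // use at the post-dominator, so it stays live across the other side too.
  MachineInstr *Use = BuildMI(NCD, NCD.getFirstNonPHI(), DebugLoc(),
                              TII.get(AMDGPU::SGPR_USE))
                          .addReg(Reg, RegState::Implicit);
  // LiveVariables is updated by hand since this runs after its analysis.
  LV.HandleVirtRegUse(Reg, &NCD, Use);
}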
#include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -79,9 +80,13 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); + AU.addRequired<LiveVariables>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -90,7 +95,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); - MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); - std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + bool MadeChange = false; + + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + SmallVector<unsigned, 16> SGPRLiveRanges; + + LiveVariables *LV = &getAnalysis<LiveVariables>(); + MachineBasicBlock *Entry = &MF.front(); - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + // Use a depth first order so that in SSA, we encounter all defs before + // uses. Once the defs of the block have been found, attempt to insert + // SGPR_USE instructions in successor blocks if required. + for (MachineBasicBlock *MBB : depth_first(Entry)) { + for (const MachineInstr &MI : *MBB) { for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; + // We should never see a live out def of a physical register, so we also + // do not need to worry about implicit_defs(). unsigned Def = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); + if (TRI->isSGPRClass(MRI.getRegClass(Def))) { + // Only consider defs that are live outs. We don't care about def / + // use within the same block. + + // LiveVariables does not consider registers that are only used in a + // phi in a sucessor block as live out, unlike LiveIntervals. + // + // This is OK because SIFixSGPRCopies replaced any SGPR phis with + // VGPRs. + if (LV->isLiveOut(Def, *MBB)) + SGPRLiveRanges.push_back(Def); + } } } } - } - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) + if (MBB->succ_size() < 2) continue; - // We have structured control flow, so number of succesors should be two. 
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + // We have structured control flow, so the number of successors should be + // two. + assert(MBB->succ_size() == 2); + MachineBasicBlock *SuccA = *MBB->succ_begin(); + MachineBasicBlock *SuccB = *(++MBB->succ_begin()); MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); if (!NCD) @@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), *(++NCD->succ_begin())); } - assert(SuccA && SuccB); - for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) + + for (unsigned Reg : SGPRLiveRanges) { + // FIXME: We could be smarter here. If the register is Live-In to one + // block, but the other doesn't have any SGPR defs, then there won't be a + // conflict. Also, if the branch condition is uniform then there will be + // no conflict. + bool LiveInToA = LV->isLiveIn(Reg, *SuccA); + bool LiveInToB = LV->isLiveIn(Reg, *SuccB); + + if (!LiveInToA && !LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into neither successor\n"); continue; + } + + if (LiveInToA && LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into both successors\n"); + continue; + } // This interval is live in to one successor, but not the other, so // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); + DEBUG(dbgs() << "Possible SGPR conflict detected for " + << PrintReg(Reg, TRI, 0) + << " BB#" << SuccA->getNumber() + << ", BB#" << SuccB->getNumber() + << " with NCD = BB#" << NCD->getNumber() << '\n'); + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Not expecting to extend live range of physreg"); // FIXME: Need to figure out how to update LiveRange here so this pass // will be able to preserve LiveInterval analysis. 
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); + MachineInstr *NCDSGPRUse = + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + + MadeChange = true; + LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); + + DEBUG(NCDSGPRUse->dump()); } } - return false; + return MadeChange; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c288725..02a3930 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -45,6 +45,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0; - unsigned CommuteIdx1; + unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { @@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, OpNo = CommuteIdx0; } - if (!CanCommute || !TII->commuteInstruction(MI)) + // One of the operands might be an Imm operand, and OpNo may refer to it after + // the call of commuteInstruction() below. Such situations are avoided + // here explicitly as OpNo must be a register operand to be a candidate + // for memory folding. + if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || + !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || + !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) return false; if (!TII->isOperandLegal(MI, OpNo, OpToFold)) @@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return true; } +static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, + unsigned UseOpIdx, + std::vector<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace, + const SIInstrInfo *TII, const SIRegisterInfo &TRI, + MachineRegisterInfo &MRI) { + const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + return; + } + + bool FoldingImm = OpToFold.isImm(); + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); + const TargetRegisterClass *FoldRC = + TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + + // Split 64-bit constants into 32-bits for folding. + if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + return; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV.
+ if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI.use_begin(RegSeqDstReg), + RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + return; + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; + } + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunities. The shrink operands pass + // already does this. + return; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -226,88 +340,36 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { OpToFold.getSubReg())) continue; + + // We need to mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + std::vector<FoldCandidate> FoldList; for (MachineRegisterInfo::use_iterator Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. - if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. + foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); } + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(MF); + for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, TRI)) { // Clear kill flags. if (!Fold.isImm()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI.clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp new file mode 100644 index 0000000..6b3c81c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -0,0 +1,243 @@ +//===----------------------- SIFrameLowering.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +#include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" + +using namespace llvm; + + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef<MCPhysReg> getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef<MCPhysReg> getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. 
We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven't added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually.
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + +void SIFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + + assert((RS || !MayNeedScavengingEmergencySlot) && + "RegScavenger required if spilling"); + + if (MayNeedScavengingEmergencySlot) { + int ScavengeFI = MFI->CreateSpillStackObject( + AMDGPU::SGPR_32RegClass.getSize(), + AMDGPU::SGPR_32RegClass.getAlignment()); + RS->addScavengingFrameIndex(ScavengeFI); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h new file mode 100644 index 0000000..a9152fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -0,0 +1,34 @@ +//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class SIFrameLowering final : public AMDGPUFrameLowering { +public: + SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + ~SIFrameLowering() override {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS = nullptr) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2db9ff..0e043cb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -261,6 +305,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); } +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -269,7 +348,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Assume that we will use FLAT for all global memory accesses // on VI. @@ -282,51 +361,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // because it has never been validated. return isLegalFlatAddressingMode(AM); } - // fall-through - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? + return isLegalMUBUFAddressingMode(AM); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // If the offset isn't a multiple of 4, it probably isn't going to be + // correctly aligned. + if (AM.BaseOffs % 4 != 0) + return isLegalMUBUFAddressingMode(AM); + + // There are no SMRD extloads, so if we have to do a small type access we + // will use a MUBUF load. + // FIXME?: We also need to do this if unaligned, but we don't know the + // alignment here. + if (DL.getTypeStoreSize(Ty) < 4) + return isLegalMUBUFAddressingMode(AM); + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // SMRD instructions have an 8-bit, dword offset on SI. + if (!isUInt<8>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + // On CI+, this can also be a 32-bit literal constant offset. If it fits + // in 8-bits, it can use a smaller encoding. + if (!isUInt<32>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // On VI, these use the SMEM format and the offset is 20-bit in bytes. + if (!isUInt<20>(AM.BaseOffs)) + return false; + } else + llvm_unreachable("unhandled generation"); - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. + if (AM.Scale == 1 && AM.HasBaseReg) return true; - default: // Don't allow n * r - return false; - } + + return false; } + + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + return isLegalMUBUFAddressingMode(AM); + case AMDGPUAS::LOCAL_ADDRESS: case AMDGPUAS::REGION_ADDRESS: { // Basic, single offset DS instructions allow a 16-bit unsigned immediate @@ -374,7 +453,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. - return Align % 4 == 0; + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; } // Smaller than dword value must be aligned. 
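A worked reading of the per-generation SMRD offset rules introduced in the hunk above (the thresholds are my arithmetic from the quoted predicates; note the BaseOffs % 4 and small-type checks fall back to MUBUF first, and the Gen enum below is a hypothetical stand-in for SOUTHERN_ISLANDS / SEA_ISLANDS / VOLCANIC_ISLANDS):

#include <cstdint>

// Standalone restatement of the SMRD immediate-offset checks, byte offsets:
//   SI: isUInt<8>(BaseOffs / 4)   -> up to 255 dwords = 1020 bytes
//   CI: isUInt<32>(BaseOffs / 4)  -> a 32-bit literal dword offset
//   VI: isUInt<20>(BaseOffs)      -> up to 1048575 bytes (just under 1 MiB)
enum class Gen { SI, CI, VI };

static bool isLegalSMRDImmOffset(Gen G, int64_t ByteOffs) {
  auto fitsUInt = [](unsigned Bits, int64_t V) {
    return V >= 0 && static_cast<uint64_t>(V) < (UINT64_C(1) << Bits);
  };
  switch (G) {
  case Gen::SI: return fitsUInt(8, ByteOffs / 4);   // dword-scaled on SI
  case Gen::CI: return fitsUInt(32, ByteOffs / 4);  // dword-scaled literal
  case Gen::VI: return fitsUInt(20, ByteOffs);      // byte offset on VI
  }
  return false;
}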
@@ -411,6 +493,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || + isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -426,12 +534,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return TII->isInlineConstant(Imm); } -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -439,7 +541,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -455,30 +557,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, unsigned Align = DL.getABITypeAlignment(Ty); - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. - assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - SDValue Ops[] = { - DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), - Load.getValue(1) - }; - - return DAG.getMergeValues(Ops, SL); - } - ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) + ExtTy = ISD::EXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, false, // isVolatile @@ -497,8 +579,16 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } - assert(CallConv == CallingConv::C); + // FIXME: We currently assume all calling conventions are kernels. SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -513,7 +603,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert((PSInputNum <= 15) && "Too many PS inputs!"); if (!Arg.Used) { - // We can savely skip PS inputs + // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; @@ -530,7 +620,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, - // NOT four or eigth. + // NOT four or eight. Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -556,41 +646,30 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - } - if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
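For orientation before the register setup that follows, a back-of-the-envelope tally of the user SGPR inputs wired up below (my arithmetic; the sizes follow the register classes the code uses, SReg_128 = 4 SGPRs and SReg_64 = 2):

// Hypothetical tally of the HSA user-SGPR inputs allocated below.
constexpr unsigned PrivateSegmentBufferSGPRs = 4; // SReg_128 resource
constexpr unsigned DispatchPtrSGPRs = 2;          // SReg_64 pointer
constexpr unsigned KernargSegmentPtrSGPRs = 2;    // SReg_64 pointer
static_assert(PrivateSegmentBufferSGPRs + DispatchPtrSGPRs +
                  KernargSegmentPtrSGPRs == 8,
              "8 user SGPRs when all three inputs are enabled");
// The system SGPRs (workgroup IDs X/Y/Z, workgroup info, scratch wave byte
// offset) are allocated after these, one SReg_32 each, further below.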
+ if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -617,7 +696,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); Chains.push_back(Arg.getValue(1)); - const PointerType *ParamTy = + auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { @@ -678,10 +757,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. + if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if we + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly.
+ + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. + Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -693,27 +875,11 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -944,20 +1110,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 
Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -977,6 +1131,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -988,7 +1154,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc DL(Op); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Should this propagate fast-math-flags? + switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + case Intrinsic::r600_read_ngroups_x: return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); @@ -1008,37 +1180,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. 
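The lowerImplicitZextParam helper added above wraps the 32-bit parameter load in an ISD::AssertZext node. A hedged sketch of what that buys, using the names from the helper (Param, SL); the fold itself is generic DAG-combiner behavior, not something this patch adds:

    // AssertZext records that bits above the asserted type are zero, so
    // computeKnownBits reports the high 16 bits of a local-size query as
    // known-zero, and a user-level mask such as
    //   unsigned lsx = get_local_size(0) & 0xffff;
    // folds away with no v_and_b32 emitted.
    SDValue Known = DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                                DAG.getValueType(MVT::i16));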
+ return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -1077,6 +1248,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(2, DL, MVT::i32), // P0 Op.getOperand(1), Op.getOperand(2), Glue); } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; case AMDGPUIntrinsic::SI_fs_interp: { SDValue IJ = Op.getOperand(4); SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, @@ -1092,6 +1267,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1152,16 +1340,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Custom lowering for non-i32 vectors hasn't been implemented."); unsigned NumElements = Op.getValueType().getVectorNumElements(); assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requirements as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory.
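The LowerLOAD hunk above stops scalarizing wide vector loads and splits them in half instead. A rough sketch of the halving step with standard SelectionDAG types — this paraphrases the idea, not the patch's exact SplitVectorLoad code:

    // A v8i32 load becomes two v4i32 loads, which buffer and DS
    // instructions can service directly, instead of eight scalar loads.
    EVT VT = Op.getValueType();                    // e.g. v8i32
    unsigned NumElts = VT.getVectorNumElements();  // 8
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
                                  VT.getVectorElementType(),
                                  NumElts / 2);    // v4i32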
if (NumElements <= 4) break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } @@ -1236,8 +1437,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); } return SDValue(); @@ -1274,6 +1477,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); @@ -1379,7 +1584,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1393,6 +1598,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); + // TODO: Should this propagate fast-math-flags? SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, DAG.getConstantFP(0.5/M_PI, DL, @@ -2125,9 +2331,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - if (TII->isMIMG(MI->getOpcode())) { + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { unsigned VReg = MI->getOperand(0).getReg(); unsigned Writemask = MI->getOperand(1).getImm(); unsigned BitsSet = 0; @@ -2169,53 +2380,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. - const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. 
If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + // Combine the constants and the pointer. + const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; -#endif + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } /// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to +/// the resource pointer. MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr, @@ -2248,15 +2444,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2274,13 +2461,41 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 
128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2301,3 +2516,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index d84c32e..e2f8cb1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); @@ -76,6 +80,9 @@ public: bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isMemOpUniform(const SDNode *N) const; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -112,13 +119,10 @@ public: SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 90a37f1..821aada 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -91,7 +91,8 @@ private: bool isOpRelevant(MachineOperand &Op); /// \brief Get register interval an operand affects. 
- RegInterval getRegInterval(MachineOperand &Op); + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; /// \brief Handle instructions' async components void pushInstruction(MachineBasicBlock &MBB, @@ -121,9 +122,13 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI insert wait instructions"; + return "SI insert wait instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -138,9 +143,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; + uint64_t TSFlags = MI.getDesc().TSFlags; + Counters Result = { { 0, 0, 0 } }; Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); @@ -151,15 +155,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // LGKM may use larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - + if (TII->isSMRD(MI)) { + + if (MI.getNumOperands() != 0) { + assert(MI.getOperand(0).isReg() && + "First LGKM operand must be a register!"); + + // XXX - What if this is a write into a super register? + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + } else { + // s_dcache_inv etc. do not have a destination register. Assume we + // want a wait on these. + // XXX - What is the right value? + Result.Named.LGKM = 1; + } } else { // DS Result.Named.LGKM = 1; @@ -173,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { } bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg()) + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) return false; // Defines are always relevant @@ -196,7 +206,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { // operand comes before the value operand and it may have // multiple data operands.
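The getHwCounts change above sizes the LGKM increment from the destination operand's register class rather than the physical register. The rule it implements, restated as a small sketch (the helper name is illustrative only):

    // SMRD results wider than one dword count as two LGKM events; a
    // result-less SMRD (s_dcache_inv and friends) and every DS op count
    // as one.
    static unsigned lgkmIncrement(bool IsSMRD, unsigned DestSizeInBytes) {
      return (IsSMRD && DestSizeInBytes > 4) ? 2 : 1;
    }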
- if (TII->isDS(MI.getOpcode())) { + if (TII->isDS(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; @@ -224,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; } -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); assert(Size >= 4); RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); + Result.first = TRI->getEncodingValue(Reg.getReg()); Result.second = Result.first + Size / 4; return Result; @@ -246,10 +251,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // Get the hardware counter increments and sum them up Counters Increment = getHwCounts(*I); + Counters Limit = ZeroCounts; unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { LastIssued.Array[i] += Increment.Array[i]; + if (Increment.Array[i]) + Limit.Array[i] = LastIssued.Array[i]; Sum += Increment.Array[i]; } @@ -261,7 +269,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // // The temporary workaround is to break the clauses with S_NOP. @@ -270,7 +278,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || (LastOpcodeType == VMEM && Increment.Named.VM)) { // Insert a NOP to break the clause. BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) @@ -278,7 +286,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, LastInstWritesM0 = false; } - if (TII->isSMRD(I->getOpcode())) + if (TII->isSMRD(*I)) LastOpcodeType = SMEM; else if (Increment.Named.VM) LastOpcodeType = VMEM; @@ -290,21 +298,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; - RegInterval Interval = getRegInterval(Op); + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { // Remember which registers we define if (Op.isDef()) - DefinedRegs[j] = LastIssued; + DefinedRegs[j] = Limit; // and which one we are using if (Op.isUse()) - UsedRegs[j] = LastIssued; + UsedRegs[j] = Limit; } } } @@ -390,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::S_SENDMSG) return LastIssued; - // For each register affected by this - // instruction increase the result sequence + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. 
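pushInstruction above switches the per-register bookkeeping from the global LastIssued counters to a per-instruction Limit. A sketch of the difference, paraphrasing the Counters union indexing used in the hunk:

    // Before: DefinedRegs[j] = LastIssued;  // a use waited on every counter
    // After:  only the counters this instruction actually bumped are kept:
    //   Counters Limit = ZeroCounts;
    //   for (unsigned i = 0; i < 3; ++i)
    //     if (Increment.Array[i])
    //       Limit.Array[i] = LastIssued.Array[i];
    //   DefinedRegs[j] = Limit;
    // so a later reader of the register waits only on counters the
    // defining instruction can still have outstanding.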
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { if (Op.isDef()) { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 211666a..0e883f6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; + // This bit tells the assembler to use the 32-bit encoding in case it + // is unable to infer the encoding from the operands. + field bits<1> VOPAsmPrefer32Bit = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{19} = FLAT; let TSFlags{20} = WQM; let TSFlags{21} = VGPRSpill; + let TSFlags{22} = VOPAsmPrefer32Bit; - // Most instructions require adjustments after selection to satisfy - // operand requirements. - let hasPostISelHook = 1; let SchedRW = [Write32Bit]; } @@ -86,7 +88,6 @@ class Enc64 { } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; -def VOPDstVCC : VOPDstOperand <VCCReg>; let Uses = [EXEC] in { @@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs), ins, asm, pattern> { - let DisableEncoding = "$dst"; let VOPC = 1; let Size = 4; + let Defs = [VCC]; } class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : @@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let isCodeGenOnly = 0; int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. 
+ let hasPostISelHook = 1; } } // End Uses = [EXEC] @@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe_ci <bits<5> op> : Enc64 { + bits<7> sdst; + bits<7> sbase; + bits<32> offset; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset; +} + let SchedRW = [WriteSALU] in { class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SOPCe <op> { - let DisableEncoding = "$dst"; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; let SOPC = 1; let isCodeGenOnly = 0; + let Defs = [SCC]; let UseNamedOperandTable = 1; } @@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : // Vector I/O operations //===----------------------------------------------------------------------===// -let Uses = [EXEC] in { - class DS <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; - let Uses = [M0]; + let Uses = [M0, EXEC]; // Most instruction load and store data, so set this as the default. let mayLoad = 1; @@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; + let Uses = [EXEC]; let hasSideEffects = 0; // XXX ???? 
} - - -} // End Uses = [EXEC] diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cfd2c42..a08a5a8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, switch (MI->getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: return true; default: return false; @@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { + + if (isDS(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (OffsetImm) { @@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; @@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return true; } - if (isSMRD(Opc)) { + if (isSMRD(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (!OffsetImm) @@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - // TODO: This needs finer tuning if (NumLoads > 4) return false; - if (isDS(Opc0) && isDS(Opc1)) + if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) return true; - if (isSMRD(Opc0) && isSMRD(Opc1)) + if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) return true; - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && + (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) return true; return false; @@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, }; static const int16_t Sub0_7[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, }; static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, }; static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, }; static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 + AMDGPU::sub0, AMDGPU::sub1, }; unsigned Opcode; - 
const int16_t *SubIndices; + ArrayRef<int16_t> SubIndices; + bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_3_64; } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_7_64; } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_15_64; } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || @@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - while (unsigned SubIdx = *SubIndices++) { + if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) + Forward = true; + else + Forward = false; + + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + unsigned SubIdx; + if (Forward) + SubIdx = SubIndices[Idx]; + else + SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); + + if (Idx == SubIndices.size() - 1) + Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); - if (*SubIndices) + if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); } } @@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getSGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_SAVE; + case 8: + return AMDGPU::SI_SPILL_S64_SAVE; + case 16: + return AMDGPU::SI_SPILL_S128_SAVE; + case 32: + return AMDGPU::SI_SPILL_S256_SAVE; + case 64: + return AMDGPU::SI_SPILL_S512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_SAVE; + case 8: + return AMDGPU::SI_SPILL_V64_SAVE; + case 16: + return AMDGPU::SI_SPILL_V128_SAVE; + case 32: + return AMDGPU::SI_SPILL_V256_SAVE; + case 64: + return AMDGPU::SI_SPILL_V512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; + + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = 
FrameInfo->getObjectAlignment(FrameIndex); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } + unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; } - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); + .addReg(SrcReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + MFI->setHasSpilledVGPRs(); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_S64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_S128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_S256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_S512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_V64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_V128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_V256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_V512_RESTORE; + default: + llvm_unreachable("unknown register size"); } } @@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = 
AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); + + if (RI.isSGPRClass(RC)) { + // FIXME: Maybe this should not include a memoperand because it will be + // lowered to non-memory instructions. + unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); - } else { + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + + return; } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } RS->enterBasicBlock(&Entry); + // FIXME: Can we scavenge an SReg_64 and access the subregs? 
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) @@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { +void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, + int Count) const { while (Count > 0) { int Arg; if (Count >= 8) @@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. MI->eraseFromParent(); @@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } } return true; } -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx0 and OpIdx1. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx0 and OpIdx1. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. 
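One reason the new commuteInstructionImpl can fail even on instructions marked commutable: many VALU operations commute only through a partner opcode. A short sketch with real opcodes; the flow paraphrases the commuteOpcode check in the body that follows:

    // v_sub_f32 dst, a, b  <->  v_subrev_f32 dst, b, a
    int CommutedOpcode = commuteOpcode(*MI); // -1 when no partner exists
    if (CommutedOpcode == -1)
      return nullptr;                        // commuting is impossible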
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { int CommutedOpcode = commuteOpcode(*MI); if (CommutedOpcode == -1) return nullptr; int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - MachineOperand &Src0 = MI->getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1); - if (Src1Idx == -1) + + if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || + OpIdx1 != static_cast<unsigned>(Src1Idx)) && + (OpIdx0 != static_cast<unsigned>(Src1Idx) || + OpIdx1 != static_cast<unsigned>(Src0Idx))) return nullptr; MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } if (MI) @@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; @@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. + // immediate. Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); if (!MI->getOperand(Src0Idx).isReg()) return false; @@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, if (Src1Idx == -1) return false; - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } else return false; - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, @@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const { } } -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2)); - // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); removeModOperands(*UseMI); @@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, return false; } -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && "MIa must load from or modify a memory location"); assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && @@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, + // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
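The rewritten disjointness checks below reduce to a small pairwise matrix over instruction kinds. Summarized informally from the code that follows (the buffer/SMRD cross cases carry extra conditions; see the code):

    // DS          vs DS           -> compare offsets
    // MUBUF/MTBUF vs MUBUF/MTBUF  -> compare offsets
    // SMRD        vs SMRD         -> compare offsets
    // FLAT        vs FLAT         -> compare offsets
    // FLAT        vs anything else -> unknown (may alias)
    // remaining cross-kind pairs   -> treated as trivially disjoint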
- if (isDS(Opc0)) { - if (isDS(Opc1)) + if (isDS(*MIa)) { + if (isDS(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1); + return !isFLAT(*MIb); } - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) + if (isMUBUF(*MIa) || isMTBUF(*MIa)) { + if (isMUBUF(*MIb) || isMTBUF(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isSMRD(Opc1); + return !isFLAT(*MIb) && !isSMRD(*MIb); } - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) + if (isSMRD(*MIa)) { + if (isSMRD(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); } - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) + if (isFLAT(*MIa)) { + if (isFLAT(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return false; } +static unsigned findImplicitSGPRRead(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + switch (MO.getReg()) { + case AMDGPU::VCC: + case AMDGPU::M0: + case AMDGPU::FLAT_SCR: + return MO.getReg(); + + default: + break; + } + } + + return AMDGPU::NoRegister; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } - // Make sure the register classes are correct + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " @@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; + unsigned SGPRUsed = findImplicitSGPRRead(*MI); + if (SGPRUsed != AMDGPU::NoRegister) + ++ConstantBusCount; + for (int OpIdx : OpIndices) { if (OpIdx == -1) break; @@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } + // Make sure we aren't losing exec uses in the td files. This mostly requires + // being careful when using let Uses to try to add other use registers. 
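findImplicitSGPRRead above lets the verifier count implicit scalar reads against the constant bus budget. The rule it enforces, sketched with concrete instructions:

    // At most one SGPR (or literal constant) may be read through the
    // scalar operand bus by a single VALU instruction; implicit reads of
    // VCC, M0 and FLAT_SCR count against the same budget.
    //   v_add_f32  v0, s0, v1            ; one SGPR           -> legal
    //   v_add_f32  v0, s0, s1            ; two distinct SGPRs -> rejected
    //   v_addc_u32 v0, vcc, s0, v1, vcc  ; s0 + implicit VCC  -> rejected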
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { + const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); + if (!Exec || !Exec->isImplicit()) { + ErrInfo = "VALU instruction does not implicitly read exec mask"; + return false; + } + } + return true; } @@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORD_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubIdx, const TargetRegisterClass *SubRC) const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( return MachineOperand::CreateReg(SubReg, false); } -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - // Change the order of operands from (0, 1, 2) to (0, 2, 1) void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { assert(Inst->getNumExplicitOperands() == 3); @@ -1643,6 +1765,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1653,7 +1810,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (isVALU(InstDesc.Opcode) && + if (isVALU(*MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; @@ -1670,21 +1827,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? 
- MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1699,81 +1842,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); + } - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. + if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. 
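For readers following legalizeOperandsVOP2 above: src1 of a VOP2 encoding only accepts a VGPR, while src0 also takes SGPRs and immediates, so swapping the operands is usually cheaper than inserting a v_mov. A minimal model of that commute-only-when-it-helps policy follows; the enum and helper names are hypothetical, not the backend's actual data structures.

    #include <cassert>

    enum class Kind { SGPR, VGPR, Imm };

    // src1 of a VOP2 must be a VGPR; src0 may be a VGPR, an SGPR or an imm.
    inline bool legalSrc1(Kind k) { return k == Kind::VGPR; }

    // Returns true if the instruction is legal after (possibly) swapping,
    // false if a copy into a VGPR is unavoidable. Mirrors the policy of the
    // hunk above, not its exact mechanics.
    bool legalizeByCommute(Kind &src0, Kind &src1, bool commutable) {
      if (legalSrc1(src1))
        return true;                  // already fine, don't touch it
      if (!commutable || !legalSrc1(src0))
        return false;                 // caller must insert a VGPR copy instead
      Kind t = src0; src0 = src1; src1 = t;   // commute: src0 <-> src1
      return true;
    }

    int main() {
      Kind a = Kind::VGPR, b = Kind::SGPR;
      assert(legalizeByCommute(a, b, true) && b == Kind::VGPR); // swap fixed it
      Kind c = Kind::SGPR, d = Kind::Imm;
      assert(!legalizeByCommute(c, d, true)); // no VGPR to swap in: need a copy
    }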
+ if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { legalizeOpWithMove(MI, Src1Idx); return; } - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + MI->setDesc(get(CommutedOpc)); - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); +// Legalize VOP3 operands. Because all operand types are supported for any +// operand, and since literal constants are not allowed and should never be +// seen, we only need to worry about inserting copies if we use multiple SGPR +// operands. +void SIInstrInfo::legalizeOperandsVOP3( + MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + int VOP3Idx[3] = { + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) + }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + // We should never see a VOP3 instruction with an illegal immediate operand. + if (!MO.isReg()) + continue; + + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; } + + // If we make it this far, then the operand is not legal and we must + // legalize it. 
+    legalizeOpWithMove(MI, Idx);
+  }
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+  // Legalize VOP2
+  if (isVOP2(*MI)) {
+    legalizeOperandsVOP2(MRI, MI);
+    return;
+  }
+
+  // Legalize VOP3
+  if (isVOP3(*MI)) {
+    legalizeOperandsVOP3(MRI, MI);
+    return;
   }
 
   // Legalize REG_SEQUENCE and PHI
   // The register class of the operands must be the same type as the register
   // class of the output.
-  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
-      MI->getOpcode() == AMDGPU::PHI) {
+  if (MI->getOpcode() == AMDGPU::PHI) {
     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
       if (!MI->getOperand(i).isReg() ||
@@ -1802,26 +2007,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     }
 
     // Update all the operands so they have the same type.
-    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
-      if (!MI->getOperand(i).isReg() ||
-          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+      MachineOperand &Op = MI->getOperand(I);
+      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
         continue;
       unsigned DstReg = MRI.createVirtualRegister(RC);
-      MachineBasicBlock *InsertBB;
-      MachineBasicBlock::iterator Insert;
-      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
-        InsertBB = MI->getParent();
-        Insert = MI;
-      } else {
-        // MI is a PHI instruction.
-        InsertBB = MI->getOperand(i + 1).getMBB();
-        Insert = InsertBB->getFirstTerminator();
+
+      // MI is a PHI instruction.
+      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
+      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
+
+      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+        .addOperand(Op);
+      Op.setReg(DstReg);
+    }
+  }
+
+  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
+  // VGPR dest type and SGPR sources, insert copies so all operands are
+  // VGPRs. This seems to help operand folding / the register coalescer.
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+    MachineBasicBlock *MBB = MI->getParent();
+    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
+    if (RI.hasVGPRs(DstRC)) {
+      // Update all the operands so they are VGPR register classes. These may
+      // not be the same register class because REG_SEQUENCE supports mixing
+      // subregister index types e.g. sub0_sub1 + sub2 + sub3
+      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+        MachineOperand &Op = MI->getOperand(I);
+        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+          continue;
+
+        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
+        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
+        if (VRC == OpRC)
+          continue;
+
+        unsigned DstReg = MRI.createVirtualRegister(VRC);
+
+        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+          .addOperand(Op);
+
+        Op.setReg(DstReg);
+        Op.setIsKill();
       }
-      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
-              get(AMDGPU::COPY), DstReg)
-              .addOperand(MI->getOperand(i));
-      MI->getOperand(i).setReg(DstReg);
     }
+
+    return;
   }
 
   // Legalize INSERT_SUBREG
@@ -1858,15 +2090,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   }
 
   MachineBasicBlock &MBB = *MI->getParent();
-  // Extract the ptr from the resource descriptor.
-
-  // SRsrcPtrLo = srsrc:sub0
-  unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
-  // SRsrcPtrHi = srsrc:sub1
-  unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+  // Extract the ptr from the resource descriptor.
+  unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+    &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
 
   // Create an empty resource descriptor
   unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1891,80 +2118,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
       .addImm(RsrcDataFormat >> 32);
 
   // NewSRsrc = {Zero64, SRsrcFormat}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewSRsrc)
-          .addReg(Zero64)
-          .addImm(AMDGPU::sub0_sub1)
-          .addReg(SRsrcFormatLo)
-          .addImm(AMDGPU::sub2)
-          .addReg(SRsrcFormatHi)
-          .addImm(AMDGPU::sub3);
+  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+    .addReg(Zero64)
+    .addImm(AMDGPU::sub0_sub1)
+    .addReg(SRsrcFormatLo)
+    .addImm(AMDGPU::sub2)
+    .addReg(SRsrcFormatHi)
+    .addImm(AMDGPU::sub3);
 
   MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
   unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-  unsigned NewVAddrLo;
-  unsigned NewVAddrHi;
   if (VAddr) {
     // This is already an ADDR64 instruction so we need to add the pointer
     // extracted from the resource descriptor to the current value of VAddr.
-    NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-    // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
-    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
-            NewVAddrLo)
-            .addReg(SRsrcPtrLo)
-            .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
-            .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
-
-    // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
-    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
-            NewVAddrHi)
-            .addReg(SRsrcPtrHi)
-            .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
-            .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
-            .addReg(AMDGPU::VCC, RegState::Implicit);
-
+    unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+    DebugLoc DL = MI->getDebugLoc();
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+      .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+
+    // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+      .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(NewVAddrLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(NewVAddrHi)
+      .addImm(AMDGPU::sub1);
   } else {
     // This instruction is the _OFFSET variant, so we need to convert it to
     // ADDR64.
+    assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
+           < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+           "FIXME: Need to emit flat atomics here");
+
     MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
     MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
     MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
-
-    // Create the new instruction.
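A note on the descriptor rewrite above: a MUBUF resource descriptor (V#) is 128 bits whose low quadword carries the 48-bit base address, and the patch now moves that whole low half (sub0_sub1) into the address calculation while substituting a descriptor with a zero base and the default data format in dwords 2-3. Below is a plain-integer illustration of the same split, reusing the RSRC_DATA_FORMAT constant this patch declares in SIInstrInfo.h; the struct layout is a sketch, not the backend's representation.

    #include <cassert>
    #include <cstdint>

    struct Rsrc128 { uint64_t lo, hi; };  // dwords 0-1 and 2-3 of the V#

    const uint64_t RSRC_DATA_FORMAT = 0xf00000000000ULL; // as in the patch

    // Bits 47:0 of the low quadword are the base address; the code above
    // moves the whole low half (sub0_sub1) into NewVAddr.
    uint64_t extractBasePtr(const Rsrc128 &r) {
      return r.lo;
    }

    // Replacement descriptor: zero base, default format in dwords 2-3.
    Rsrc128 makeNullDescriptor() {
      return { 0, RSRC_DATA_FORMAT };
    }

    int main() {
      Rsrc128 r = { 0x0000123456789abcULL, 0xdeadbeefULL };
      uint64_t base = extractBasePtr(r); // what gets folded into NewVAddr
      Rsrc128 n = makeNullDescriptor();  // what becomes NewSRsrc
      assert(base == 0x0000123456789abcULL);
      assert(n.lo == 0 && n.hi == RSRC_DATA_FORMAT);
    }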
     unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
-    MachineInstr *Addr64 =
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
-              .addOperand(*VData)
-              .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
-                                          // This will be replaced later
-                                          // with the new value of vaddr.
-              .addOperand(*SRsrc)
-              .addOperand(*SOffset)
-              .addOperand(*Offset)
-              .addImm(0) // glc
-              .addImm(0) // slc
-              .addImm(0); // tfe
+
+    // Atomics with return have an additional tied operand and are
+    // missing some of the special bits.
+    MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+    MachineInstr *Addr64;
+
+    if (!VDataIn) {
+      // Regular buffer load / store.
+      MachineInstrBuilder MIB
+        = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+        .addOperand(*VData)
+        .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                    // This will be replaced later
+                                    // with the new value of vaddr.
+        .addOperand(*SRsrc)
+        .addOperand(*SOffset)
+        .addOperand(*Offset);
+
+      // Atomics do not have this operand.
+      if (const MachineOperand *GLC
+          = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+        MIB.addImm(GLC->getImm());
+      }
+
+      MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+
+      if (const MachineOperand *TFE
+          = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+        MIB.addImm(TFE->getImm());
+      }
+
+      MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+      Addr64 = MIB;
+    } else {
+      // Atomics with return.
+      Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+        .addOperand(*VData)
+        .addOperand(*VDataIn)
+        .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                    // This will be replaced later
+                                    // with the new value of vaddr.
+        .addOperand(*SRsrc)
+        .addOperand(*SOffset)
+        .addOperand(*Offset)
+        .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
+        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+    }
 
     MI->removeFromParent();
     MI = Addr64;
 
-    NewVAddrLo = SRsrcPtrLo;
-    NewVAddrHi = SRsrcPtrHi;
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+      .addImm(AMDGPU::sub1);
+
     VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
     SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
   }
 
-  // NewVaddr = {NewVaddrHi, NewVaddrLo}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewVAddr)
-          .addReg(NewVAddrLo)
-          .addImm(AMDGPU::sub0)
-          .addReg(NewVAddrHi)
-          .addImm(AMDGPU::sub1);
-
-  // Update the instruction to use NewVaddr
   VAddr->setReg(NewVAddr);
   // Update the instruction to use NewSRsrc
@@ -2028,53 +2287,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI,
                   .addOperand(*SOff);
     unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
-            .addOperand(*SOff)
-            .addImm(HalfSize);
-    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp))
+      .addReg(SOff->getReg(), 0, SOff->getSubReg())
+      .addImm(HalfSize);
+    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
             .addReg(SBase->getReg(), getKillRegState(IsKill), SBase->getSubReg())
             .addReg(OffsetSGPR);
   }
 
   unsigned SubLo, SubHi;
+  const TargetRegisterClass *NewDstRC;
   switch (HalfSize) {
   case 4:
     SubLo = AMDGPU::sub0;
     SubHi = AMDGPU::sub1;
+    NewDstRC = &AMDGPU::VReg_64RegClass;
     break;
   case 8:
     SubLo = AMDGPU::sub0_sub1;
     SubHi = AMDGPU::sub2_sub3;
+    NewDstRC = &AMDGPU::VReg_128RegClass;
     break;
   case 16:
     SubLo = AMDGPU::sub0_sub1_sub2_sub3;
     SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+    NewDstRC =
&AMDGPU::VReg_256RegClass; break; case 32: SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + NewDstRC = &AMDGPU::VReg_512RegClass; break; default: llvm_unreachable("Unhandled HalfSize"); } - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); + unsigned OldDst = MI->getOperand(0).getReg(); + unsigned NewDst = MRI.createVirtualRegister(NewDstRC); + + MRI.replaceRegWith(OldDst, NewDst); + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; + switch(RI.getRegClass(DstRCID)->getSize()) { + case 4: + case 8: + case 16: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -2118,53 +2388,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + + const MCInstrDesc &NewInstDesc = get(NewOpcode); + const TargetRegisterClass *NewDstRC + = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + unsigned DstReg = MI->getOperand(0).getReg(); MRI.replaceRegWith(DstReg, NewDstReg); + + MachineInstr *NewInst = + BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) + .addOperand(MI->getOperand(1)) // sbase + .addReg(SRsrc) + .addImm(0) + .addImm(ImmOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI->eraseFromParent(); + + legalizeOperands(NewInst); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); break; } - case 
AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { + case 32: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { + case 64: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } } @@ -2185,51 +2457,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Handle some special cases switch (Opcode) { default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); + if (isSMRD(*Inst)) { + moveSMRDToVALU(Inst, MRI, Worklist); + continue; } break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. - if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); Inst->eraseFromParent(); continue; @@ -2281,6 +2530,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } break; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst->eraseFromParent(); + continue; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2319,7 +2573,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->addOperand(MachineOperand::CreateImm(0)); } - addDescImplicitUseDef(NewDesc, Inst); + Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst->getOperand(2); @@ -2337,27 +2591,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Update the destination register class. 
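The S_AND_B64/S_OR_B64/S_XOR_B64 cases above can now lower directly to two 32-bit VALU operations because each half of a bitwise result depends only on the matching halves of the inputs. Here is a self-contained check of that decomposition, mirroring what splitScalar64BitBinaryOp does with sub0/sub1 halves and a REG_SEQUENCE recombine (pure C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    // Split a 64-bit bitwise op into two 32-bit halves and recombine, the
    // same decomposition splitScalar64BitBinaryOp performs with registers.
    template <typename Op32>
    uint64_t split64(uint64_t a, uint64_t b, Op32 op) {
      uint32_t lo = op(uint32_t(a), uint32_t(b));             // LoHalf
      uint32_t hi = op(uint32_t(a >> 32), uint32_t(b >> 32)); // HiHalf
      return (uint64_t(hi) << 32) | lo;          // REG_SEQUENCE sub0/sub1
    }

    int main() {
      auto and32 = [](uint32_t x, uint32_t y) { return x & y; };
      auto xor32 = [](uint32_t x, uint32_t y) { return x ^ y; };
      uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
      assert(split64(a, b, and32) == (a & b));
      assert(split64(a, b, xor32) == (a ^ b));
    }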
- - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; unsigned DstReg = Inst->getOperand(0).getReg(); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); @@ -2366,13 +2602,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Legalize the operands legalizeOperands(Inst); - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } @@ -2390,6 +2620,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VGPR_32RegClass; } +void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + .addImm(0) + .addReg(Src.getReg()); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(Src.getReg()) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitUnaryOp( SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, @@ -2414,20 +2668,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp( AMDGPU::sub0, Src0SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) 
     .addImm(AMDGPU::sub0)
@@ -2436,10 +2691,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
-  // Try to legalize the operands in case we need to swap the order to keep it
-  // valid.
-  Worklist.push_back(LoHalf);
-  Worklist.push_back(HiHalf);
+  // We don't need to legalizeOperands here because for a single operand, src0
+  // will support any kind of input.
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
 void SIInstrInfo::splitScalar64BitBinaryOp(
@@ -2474,9 +2730,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
                                                        AMDGPU::sub0, Src1SubRC);
 
   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
-  const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
-  unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
     .addOperand(SrcReg0Sub0)
     .addOperand(SrcReg1Sub0);
@@ -2486,12 +2743,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                        AMDGPU::sub1, Src1SubRC);
 
-  unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
     .addOperand(SrcReg0Sub1)
     .addOperand(SrcReg1Sub1);
 
-  unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
     .addReg(DestSub0)
     .addImm(AMDGPU::sub0)
@@ -2502,8 +2759,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  Worklist.push_back(LoHalf);
-  Worklist.push_back(HiHalf);
+  legalizeOperands(LoHalf);
+  legalizeOperands(HiHalf);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
@@ -2532,18 +2792,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                       AMDGPU::sub1, SrcSubRC);
 
-  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+  BuildMI(MBB, MII, DL, InstDesc, MidReg)
     .addOperand(SrcRegSub0)
     .addImm(0);
 
-  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
     .addOperand(SrcRegSub1)
     .addReg(MidReg);
 
   MRI.replaceRegWith(Dest.getReg(), ResultReg);
 
-  Worklist.push_back(First);
-  Worklist.push_back(Second);
+  // We don't need to legalize operands here. src0 for either instruction can be
+  // an SGPR, and the second input is unused or determined here.
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2587,6 +2848,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return; } @@ -2605,33 +2867,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, - MachineInstr *Inst) const { - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); +void SIInstrInfo::addUsersToMoveToVALUWorklist( + unsigned DstReg, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); } } +} - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } +const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( + const MachineInstr &Inst) const { + const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); + + switch (Inst.getOpcode()) { + // For target instructions, getOpRegClass just returns the virtual register + // class associated with the operand, so we need to find an equivalent VGPR + // register class in order to move the instruction to the VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + return NewDstRC; + default: + return NewDstRC; } } +// Find the one SGPR operand we are allowed to use. unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); + const MCInstrDesc &Desc = MI->getDesc(); // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - + // // First we need to consider the instruction's operand requirements before // legalizing. Some operands are required to be SGPRs, such as implicit uses // of VCC, but we are still bound by the constant bus requirement to only use @@ -2639,17 +2921,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. 
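On splitScalar64BitBCNT above: V_BCNT_U32_B32 computes countbits(src0) + src1, so a 64-bit scalar bit count (S_BCNT1_I32_B64) becomes two chained 32-bit counts, the second accumulating the first. A runnable sketch of that identity, with toy functions rather than the instruction definitions:

    #include <cassert>
    #include <cstdint>

    // 32-bit popcount with an accumulator operand, modeling V_BCNT_U32_B32
    // (dst = countbits(src0) + src1).
    uint32_t bcnt_u32_b32(uint32_t src0, uint32_t src1) {
      uint32_t n = 0;
      while (src0) { src0 &= src0 - 1; ++n; }  // clear lowest set bit
      return n + src1;
    }

    // The two-instruction sequence splitScalar64BitBCNT emits
    // (MidReg, then ResultReg).
    uint32_t bcnt1_b64(uint64_t src) {
      uint32_t mid = bcnt_u32_b32(uint32_t(src), 0);  // low half, acc = 0
      return bcnt_u32_b32(uint32_t(src >> 32), mid);  // high half + mid
    }

    int main() {
      assert(bcnt1_b64(0) == 0);
      assert(bcnt1_b64(~0ULL) == 64);
      assert(bcnt1_b64(0x8000000180000001ULL) == 4);
    }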
- if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } + unsigned SGPRReg = findImplicitSGPRRead(*MI); + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -2660,15 +2934,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, break; const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); + if (!MO.isReg()) + continue; - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } + // Is this operand statically required to be an SGPR based on the operand + // constraints? + const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); + bool IsRequiredSGPR = RI.isSGPRClass(OpRC); + if (IsRequiredSGPR) + return MO.getReg(); - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; + // If this could be a VGPR or an SGPR, Check the dynamic register class. + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); + if (RI.isSGPRClass(RegRC)) + UsedSGPRs[i] = Reg; + } // We don't have a required SGPR operand, so we have a bit more freedom in // selecting operands to move. @@ -2680,6 +2961,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // V_FMA_F32 v0, s0, s0, s0 -> No moves // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + // TODO: If some of the operands are 64-bit SGPRs and some 32, we should + // prefer those. + if (UsedSGPRs[0] != AMDGPU::NoRegister) { if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[0]; @@ -2720,7 +3004,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addReg(IndirectBaseReg) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5053786..307ef67 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -39,14 +39,11 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned Opcode) const; @@ -58,13 +55,24 @@ private: void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; - void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + void addUsersToMoveToVALUWorklist( + unsigned Reg, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + const TargetRegisterClass * + getDestEquivalentVGPRClass(const MachineInstr &Inst) const; bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, MachineInstr *MIb) const; unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; +protected: + MachineInstr 
*commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -117,17 +125,14 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + + LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - bool areMemAccessesTriviallyDisjoint( MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA = nullptr) const override; @@ -137,8 +142,6 @@ public: unsigned DstReg, unsigned SrcReg) const override; bool isMov(unsigned Opcode) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -148,78 +151,154 @@ public: MachineBasicBlock::iterator &MI, LiveVariables *LV) const override; + static bool isSALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SALU; + } + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } + static bool isVALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VALU; + } + bool isVALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isSOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP1; + } + bool isSOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } + static bool isSOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP2; + } + bool isSOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } + static bool isSOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPC; + } + bool isSOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } + static bool isSOPK(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK; + } + bool isSOPK(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } + static bool isSOPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPP; + } + bool isSOPP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isVOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP1; + } + bool isVOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } + static bool isVOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP2; + } + bool isVOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } + static bool isVOP3(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + } + bool isVOP3(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isVOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOPC; + } + bool isVOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } + static bool isMUBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; + } + bool isMUBUF(uint16_t Opcode) const { return 
get(Opcode).TSFlags & SIInstrFlags::MUBUF; } + static bool isMTBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; + } + bool isMTBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isSMRD(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SMRD; + } + bool isSMRD(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SMRD; } + static bool isDS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DS; + } + bool isDS(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } + static bool isMIMG(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MIMG; + } + bool isMIMG(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isFLAT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FLAT; + } + bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::WQM; + } + bool isWQM(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isVGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; + } + bool isVGPRSpill(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } @@ -302,6 +381,26 @@ public: bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Fix operands in \p MI to satisfy constant bus requirements. + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; @@ -312,7 +411,8 @@ public: unsigned HalfImmOp, unsigned HalfSGPROp, MachineInstr *&Lo, MachineInstr *&Hi) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the @@ -341,29 +441,49 @@ public: void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. 
+  LLVM_READONLY
   MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
 
+  LLVM_READONLY
   const MachineOperand *getNamedOperand(const MachineInstr &MI,
                                         unsigned OpName) const {
     return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
   }
 
+  /// Get required immediate operand
+  int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const {
+    int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+    return MI.getOperand(Idx).getImm();
+  }
+
   uint64_t getDefaultRsrcDataFormat() const;
   uint64_t getScratchRsrcWords23() const;
 };
 
 namespace AMDGPU {
-
+  LLVM_READONLY
   int getVOPe64(uint16_t Opcode);
+
+  LLVM_READONLY
   int getVOPe32(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteOrig(uint16_t Opcode);
+
+  LLVM_READONLY
   int getAddr64Inst(uint16_t Opcode);
+
+  LLVM_READONLY
   int getAtomicRetOp(uint16_t Opcode);
+
+  LLVM_READONLY
   int getAtomicNoRetOp(uint16_t Opcode);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8d8110b..10f2adde 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -8,9 +8,9 @@
 //===----------------------------------------------------------------------===//
 def isCI : Predicate<"Subtarget->getGeneration() "
                      ">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isVI : Predicate <
-  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
-  AssemblerPredicate<"FeatureGCN3Encoding">;
+def isCIOnly : Predicate<"Subtarget->getGeneration() =="
+                         "AMDGPUSubtarget::SEA_ISLANDS">,
+  AssemblerPredicate <"FeatureSeaIslands">;
 
 def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
 
@@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> {
   field bits<5> VI = vi;
 }
 
+// Specify an SMRD opcode for SI and SMEM opcode for VI
+
+// FIXME: This should really be bits<5> si, Tablegen crashes if a
+// parameter's default value is another parameter with a different bit size
+class smrd<bits<8> si, bits<8> vi = si> {
+  field bits<5> SI = si{4-0};
+  field bits<8> VI = vi;
+}
+
 // Except for the NONE field, this must be kept in sync with the SISubtarget enum
 // in AMDGPUInstrInfo.cpp
 def SISubtarget {
@@ -121,9 +130,20 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
 def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
 
 def SIconstdata_ptr : SDNode<
-  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
+                                                     SDTCisVT<0, i64>]>
 >;
 
+def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+  return isGlobalLoad(cast<LoadSDNode>(N)) ||
+         isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+  return isConstantLoad(cast<LoadSDNode>(N), -1) &&
+         static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
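The SIInstrInfo.h hunk above finishes converting the opcode-number predicates into static TSFlags queries, so callers can classify a MachineInstr directly instead of round-tripping through getOpcode(). The mechanism is just a per-opcode flag word tested against a mask; the following is a self-contained model of the pattern, where the flag values and the Instr/InstrDesc types are invented for the example (the real bits come from tablegen):

    #include <cassert>
    #include <cstdint>

    // Toy mirror of SIInstrFlags-style bit assignments (values invented).
    namespace Flags {
      const uint64_t SALU  = 1ull << 0;
      const uint64_t VALU  = 1ull << 1;
      const uint64_t VOP2  = 1ull << 2;
      const uint64_t MUBUF = 1ull << 3;
    }

    struct InstrDesc { uint64_t TSFlags; };
    struct Instr {
      const InstrDesc *Desc;
      const InstrDesc &getDesc() const { return *Desc; }
    };

    // Static predicate in the style of SIInstrInfo::isVOP2(const MachineInstr &).
    static bool isVOP2(const Instr &MI) {
      return MI.getDesc().TSFlags & Flags::VOP2;
    }

    int main() {
      InstrDesc vAdd { Flags::VALU | Flags::VOP2 };
      InstrDesc sMov { Flags::SALU };
      Instr a{&vAdd}, b{&sMov};
      assert(isVOP2(a) && !isVOP2(b));
    }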
@@ -328,9 +348,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (RC && SIRI->isSGPRClass(RC)) return true; - } } return false; }]>; @@ -354,6 +374,8 @@ def sopp_brtarget : Operand<OtherVT> { let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand<iPTR>; + include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -393,7 +415,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; class GLCBaseMatchClass <string parser> : AsmOperandClass { let Name = "GLC"#parser; let PredicateMethod = "isImm"; - let ParserMethod = parser; + let ParserMethod = parser; let RenderMethod = "addImmOperands"; } @@ -436,6 +458,17 @@ def ClampMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { + let Name = "SMRDOffset"#predicate; + let PredicateMethod = predicate; + let RenderMethod = "addImmOperands"; +} + +def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; +def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < + "isSMRDLiteralOffset" +>; + let OperandType = "OPERAND_IMMEDIATE" in { def offen : Operand<i1> { @@ -510,6 +543,16 @@ def ClampMod : Operand <i1> { let ParserMatchClass = ClampMatchClass; } +def smrd_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDOffsetMatchClass; +} + +def smrd_literal_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDLiteralOffsetMatchClass; +} + } // End OperandType = "OPERAND_IMMEDIATE" def VOPDstS64 : VOPDstOperand <SReg_64>; @@ -528,6 +571,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; @@ -717,19 +767,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; - - def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; - - def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; -} - multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -758,8 +795,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < class SOPC_Helper <bits<7> op, 
RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; + op, (outs), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []> { + let Defs = [SCC]; +} class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -812,15 +851,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), pattern>; + def "" : SOPK_Pseudo <opName, (outs), + (ins SReg_32:$src0, u16imm:$src1), pattern> { + let Defs = [SCC]; + } + - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _si : SOPK_Real_si <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } - def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _vi : SOPK_Real_vi <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; } } @@ -868,35 +912,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, } class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, SMEMe_vi <op, imm>, SIMCInstr<opName, SISubtarget.VI> { let AssemblerPredicates = [isVI]; } -multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, +multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; // glc is only applicable to scalar stores, which are not yet // implemented. 
let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; } } -multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1 in { + def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; + + let sbase = 0, offset = 0 in { + let sdst = 0 in { + def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; + } + + let glc = 0, sdata = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + } + } + } +} + +class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : + SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let sbase = 0; + let sdata = 0; + let glc = 0; + let offset = 0; +} + +multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, RegisterClass dstClass> { defm _IMM : SMRD_m < op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), + (ins baseClass:$sbase, smrd_offset:$offset), opName#" $dst, $sbase, $offset", [] >; + def _IMM_ci : SMRD < + (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + let AssemblerPredicates = [isCIOnly]; + } + defm _SGPR : SMRD_m < op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), @@ -922,11 +999,12 @@ def InputModsNoDefault : Operand <i32> { let ParserMatchClass = InputModsMatchClass; } -class getNumSrcArgs<ValueType Src1, ValueType Src2> { +class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src0.Value, untyped.Value), 0, + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -934,28 +1012,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - VOPDstOperand<SReg_64>)); // else VT == i1 + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); + RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } // Returns the register class to use for source 1 of VOP[12C] for the // given VT. class getVOPSrc1ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); + RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); + RegisterOperand ret = + !if(!eq(VT.Size, 64), + VCSrc_64, + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32 + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? 
class hasModifiers<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, 0)); @@ -1009,17 +1096,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. -class getAsm32 <int NumSrcArgs> { +class getAsm32 <bit HasDst, int NumSrcArgs> { + string dst = "$dst"; + string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); + string ret = !if(HasDst, dst, "") # + !if(!eq(NumSrcArgs, 1), src0, "") # + !if(!eq(NumSrcArgs, 2), src0#src1, "") # + !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <int NumSrcArgs, bit HasModifiers> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1027,11 +1117,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<NumSrcArgs>.ret, + getAsm32<HasDst, NumSrcArgs>.ret, "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } - class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1047,29 +1136,38 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; - field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); + field bit HasDst32 = HasDst; + field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; - field dag Outs = (outs DstRC:$dst); + field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + + // VOP3b instructions are a special case with a second explicit + // output. This is manually overridden for them. + field dag Outs32 = Outs; + field dag Outs64 = Outs; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = getAsm32<NumSrcArgs>.ret; - field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; + field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order // for the instruction patterns to work. 
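// Illustrative sketch of how the f16 change below interacts with the
// helpers above (V_EXAMPLE_F16 is a hypothetical name, not in the patch):
//   def V_EXAMPLE_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
// DstVT.Size == 16 now hits the new 16-bit case in getVALUDstForVT
// (VOPDstOperand<VGPR_32>), while getVOPSrc0ForVT still falls through
// to VSrc_32 because only the 64-bit case is special-cased there.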
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; + def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; @@ -1087,25 +1185,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + +// Write out to vcc or arbitrary SGPR. +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$dst, vcc, $src0, $src1"; + let Asm64 = "$dst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; + let Asm32 = "$dst, vcc, $src0, $src1, vcc"; + let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); } -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; } -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. +class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. 
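  // As a minimal example of what this means for printing: a compare such
  // as "v_cmp_lt_f32_e32 vcc, v0, v1" shows vcc in the asm string above,
  // but vcc is an implicit def of the VOPC encoding rather than a real
  // $dst operand, which is why HasDst32 is cleared just below.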
+ let HasDst32 = 0; +} + +class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$dst, $src0_modifiers, $src1"; } +def VOPC_I1_F32_F32 : VOPC_Profile<f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<f64>; +def VOPC_I1_I32_I32 : VOPC_Profile<i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<i64>; + +def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); let Asm64 = "$dst, $src0, $src1, $src2"; } @@ -1119,13 +1268,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers>.ret; - let Asm32 = getAsm32<2>.ret; - let Asm64 = getAsm64<2, HasModifiers>.ret; + let Asm32 = getAsm32<1, 2>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +class SIInstAlias <string asm, Instruction inst, VOPProfile p> : + InstAlias <asm, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$dst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$dst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); +} + +class SIInstAliasSI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class SIInstAliasVI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { + let AssemblerPredicates = [isVI]; +} + +multiclass SIInstAliasBuilder <string asm, VOPProfile p> { + + def : SIInstAliasSI <asm, NAME, p>; + + def : SIInstAliasVI <asm, NAME, p>; +} class VOP <string opName> { string OpName = opName; @@ -1165,20 +1361,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; + + def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; - def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; } -multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string 
asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : @@ -1202,22 +1400,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; } -multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; - def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; } @@ -1250,6 +1450,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; let isCodeGenOnly = 1; + + field bit vdst; + field bit src0; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : @@ -1295,22 +1498,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, HasMods>; } -// VOP3_m without source modifiers -multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; - } -} - multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { @@ -1335,7 +1522,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1349,7 +1536,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1360,54 +1547,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // No VI instruction. This class is for SI only. 
} -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { +// Two operand VOP3b instruction that may have a 3rd SGPR bool operand +// instead of an implicit VCC as in the VOP2b format. +multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; } multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list<SchedReadWrite> sched> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1432,32 +1606,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, } } -multiclass VOP1_Helper <vop1 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - bit HasMods> { +multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64> { - defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + defm _e32 : VOP1_m <op, opName, p, pat32>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + p.HasModifiers>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> { - defm _e32 : VOP1SI_m 
<op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + defm _e32 : VOP1SI_m <op, opName, P, []>; defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1467,36 +1637,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, opName, P.HasModifiers>; } -multiclass VOP2_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> { - defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1508,58 +1675,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, opName, revOp, P.HasModifiers>; } -multiclass VOP2b_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { +multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } - defm _e64 : VOP3b_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } } multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp, !eq(P.NumSrcArgs, 3) >; // A VOP2 instruction that is VOP3-only on VI. 
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, - revOp, HasMods>; + defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { @@ -1583,64 +1747,75 @@ let isCodeGenOnly = 0 in { } // End isCodeGenOnly = 0 } -class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : +class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr<opName#"_e32", SISubtarget.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } -multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo <outs, ins, pattern, opName>; - - def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isSICI]; +multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, + string opName, bit DefExec, VOPProfile p, + list<SchedReadWrite> sched, + string revOpName = "", string asm = opName#"_e32 "#op_asm, + string alias_asm = opName#" "#op_asm> { + def "" : VOPC_Pseudo <ins, pattern, opName> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = sched; } - def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isVI]; - } + let AssemblerPredicates = [isSICI] in { + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isSICI] + + let AssemblerPredicates = [isVI] in { + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isVI] + + defm : SIInstAliasBuilder<alias_asm, p>; } -multiclass VOPC_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; +multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, + list<dag> 
pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>; + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>; } // Special case for class instructions which only have modifiers on // the 1st source operand. -multiclass VOPC_Class_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>, +multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, string revOp = opName, - bit DefExec = 0> : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched = [Write32Bit]> : + VOPC_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1648,51 +1823,51 @@ multiclass VOPCInst <vopc op, string opName, (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp + DefExec, revOp, P, sched >; multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0> : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched> : VOPC_Class_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName + DefExec, opName, P, sched >; multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; multiclass VOPCX <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, + list<SchedReadWrite> sched, string revOp = ""> - : VOPCInst <op, opName, P, cond, revOp, 1>; + 
: VOPCInst <op, opName, P, cond, revOp, 1, sched>; multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1700,16 +1875,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, >; multiclass VOPC_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; multiclass VOPCX_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; multiclass VOPC_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; multiclass VOPCX_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < @@ -1761,25 +1936,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, 3, 1 >; -multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, - string opName, list<dag> pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : + VOP3b_2_3_m < + op, P.Outs64, P.Ins64, + opName#" "#P.Asm64, pattern, + opName, "", 1, 1 >; -multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; - -multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; - - class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1925,12 +2088,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - let data1 = 0 in { - def _si : DS_Off16_Real_si <op, 
opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } } @@ -1939,11 +2104,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, @@ -2214,7 +2381,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 >; @@ -2233,7 +2400,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, @@ -2245,7 +2412,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2256,6 +2423,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -2368,47 +2537,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, } // End mayLoad = 0, mayStore = 1 } -class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$vdst), - (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> { - let data = 0; - let mayLoad = 1; +// For cache invalidation instructions. +multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { + def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; + + // Set everything to 0. 
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, + vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; + } + + def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; + } + } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" } -class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : - FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr, - glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - name#" $data, $addr"#"$glc"#"$slc"#"$tfe", - []> { +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// + +class flat <bits<7> ci, bits<7> vi = ci> { + field bits<7> CI = ci; + field bits<7> VI = vi; +} - let mayLoad = 0; - let mayStore = 1; +class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + FLAT <0, outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} - // Encoding - let vdst = 0; +class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicate = isCIOnly; } -multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { +class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicate = VIAssemblerPredicate; +} - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT <op, (outs), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $addr, $data"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 0> { - let glc = 0; - let vdst = 0; - } +multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, + list<dag> pattern> { + def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, + AtomicNoRet <NAME, 1>; - def _RTN : FLAT <op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 1> { - let glc = 1; - } + def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; +} + +multiclass FLAT_Load_Helper <flat op, string asm_name, + RegisterClass regClass, + dag outs = (outs regClass:$vdst), + dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + + let data = 0, mayLoad = 1 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_Store_Helper <flat op, string asm_name, + RegisterClass vdataClass, + dag outs = (outs), + dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, + slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + + let mayLoad = 0, mayStore = 1, vdst = 0 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc, + dag outs_noret = (outs), + string asm_noret = asm_name#" 
$addr, $data"#"$slc"#"$tfe"> { + + let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { + def "" : FLAT_Pseudo <NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + AtomicNoRet <NAME, 0>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + } + + let glc = 1, hasPostISelHook = 1 in { + defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index e0eeea9..6f653c7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; @@ -62,36 +64,38 @@ let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 + smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 + smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 + smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 + smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 + smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; } // mayLoad = 1 //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 
0x20>, "s_dcache_inv", + int_amdgcn_s_dcache_inv>; //===----------------------------------------------------------------------===// // SOP1 Instructions @@ -123,7 +127,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] + [(set i32:$dst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; @@ -183,10 +187,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []> defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; + +let Uses = [M0] in { defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +} // End Uses = [M0] + defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; let Defs = [SCC] in { @@ -354,7 +362,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; // SOPK Instructions //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let isReMaterializable = 1, isMoveImm = 1 in { defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; } // End isReMaterializable = 1 let Uses = [SCC] in { @@ -438,36 +446,38 @@ def S_BRANCH : SOPP < let isBarrier = 1; } -let DisableEncoding = "$scc" in { +let Uses = [SCC] in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; -} // End DisableEncoding = "$scc" +} // End Uses = [SCC] +let Uses = [VCC] in { def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; +} // End Uses = [VCC] -let DisableEncoding = "$exec" in { +let Uses = [EXEC] in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; -} // End DisableEncoding = "$exec" +} // End Uses = [EXEC] } // End isBranch = 1 @@ -477,11 +487,11 @@ let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { + let SchedRW = [WriteBarrier]; let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; let mayLoad = 1; let mayStore = 1; + let isConvergent = 1; } def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; @@ -805,9 +815,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, 
"ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; @@ -905,11 +912,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// @@ -951,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < @@ -1034,9 +1036,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; + +let SubtargetPredicate = isSI in { +defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI +} + +defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1155,8 +1160,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" // VOP1 Instructions //===----------------------------------------------------------------------===// -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; } let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -1292,7 +1297,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", VOP_F32_F32, AMDGPUsin @@ -1300,6 +1307,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", VOP_F32_F32, AMDGPUcos >; + +} // End SchedRW = 
[WriteQuarterRate32] + defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; @@ -1308,24 +1318,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", VOP_I32_F64 >; + +let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", VOP_F64_F64 >; -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; + +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", + VOP_F64_F64 +>; +} // End SchedRW = [WriteDoubleAdd] + + defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", VOP_I32_F32 >; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", VOP_F32_F32 >; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; } + +let Uses = [M0, EXEC] in { defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +} // End Uses = [M0, EXEC] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1343,7 +1362,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy >; -} // End let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -1360,7 +1379,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", // VINTRP Instructions //===----------------------------------------------------------------------===// -let Uses = [M0] in { +let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP instructions. @@ -1405,16 +1424,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m < [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), (i32 imm:$attr)))]>; -} // End Uses = [M0] +} // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// multiclass V_CNDMASK <vop2 op, string name> { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; + defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; defm _e64 : VOP3_m < op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, @@ -1500,34 +1517,32 @@ let isCommutable = 1 in { defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +let isCommutable = 1 in { // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions.
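// A short illustration of the VOP2b profiles used below (operand values
// are hypothetical): in the _e32 form the carry-out is the implicit VCC,
// printed as "v_add_i32_e32 v0, vcc, v1, v2", while the VOP3b _e64 form
// takes an explicit SGPR pair, e.g. "v_add_i32_e64 v0, s[0:1], v1, v2",
// writing the sum to v0 and the per-lane carry bits to s[0:1].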
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", - VOP_I32_I32_I32, add + VOP2b_I32_I1_I32_I32 >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" + VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" >; -let Uses = [VCC] in { // Carry-in comes from VCC defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" + VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" >; -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] +} // End isCommutable = 1 defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, @@ -1575,10 +1590,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 >; defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_lo >; defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_hi >; defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp @@ -1704,15 +1719,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", @@ -1735,7 +1750,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -} // let SchedRW = [WriteDouble] +} // let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1756,16 +1771,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", } // isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", + VOP3b_F32_I1_F32_F32_F32 +>; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. 
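// Why a VOP3b profile fits here: v_div_scale_* produces both the scaled
// operand in $vdst and an i1 condition in an SGPR pair ($sdst), matching
// VOP3b_F64_I1_F64_F64_F64 below; the patch defines it only through
// VOP3bInst, i.e. there is no _e32 form with an implicit-VCC-only result.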
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", + VOP3b_F64_I1_F64_F64_F64 +>; } // let SchedRW = [WriteDouble] -let isCommutable = 1, Uses = [VCC] in { +let isCommutable = 1, Uses = [VCC, EXEC] in { +let SchedRW = [WriteFloatFMA] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) @@ -1774,6 +1794,7 @@ let isCommutable = 1, Uses = [VCC] in { defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; +} let SchedRW = [WriteDouble] in { // v_div_fmas_f64: @@ -1786,7 +1807,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", >; } // End SchedRW = [WriteDouble] -} // End isCommutable = 1 +} // End isCommutable = 1, Uses = [VCC, EXEC] //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; @@ -1835,13 +1856,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; } // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 1 in { +let hasSideEffects = 1, SALU = 1 in { def SGPR_USE : InstSI <(outs),(ins), "", []>; } @@ -1921,39 +1942,9 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < +class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore<dag outs> : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + (ins rc:$src, VSrc_32:$idx, i32imm:$off), "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; @@ -1967,6 +1958,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI < let Constraints = "$src = $dst"; } +// TODO: We can support indirect SGPR access. 
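// Sketch of the per-width pseudos defined next (the reading of $temp is
// an assumption based on the enclosing Defs = [EXEC,VCC,M0]): e.g.
// SI_INDIRECT_SRC_V4 extracts one 32-bit lane of a VReg_128 source,
// selected by $idx plus the constant $off, into a VGPR_32 $dst, with
// $temp serving as a scratch SGPR pair for the EXEC-manipulating
// expansion.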
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; +def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; +def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; + def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; @@ -1977,19 +1975,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1 in { + let UseNamedOperandTable = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), + (ins sgpr_class:$src, i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + (ins i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1 } @@ -2003,19 +2006,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { + let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs vgpr_class:$dst), (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1, VGPRSpill = 1 } @@ -2030,9 +2039,11 @@ let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] +> { + let SALU = 1; +} } // End Defs = [SCC] @@ -2072,84 +2083,63 @@ def : Pat < // SMRD Patterns //===----------------------------------------------------------------------===// -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { +multiclass SMRD_Pattern <string Instr, ValueType vt> { - // 1. SI-CI: Offset as 8bit DWORD immediate + // 1. IMM offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) >; - // 2. Offset loaded in an 32bit SGPR + // 2. SGPR offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) >; - // 3. No offset at all def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) + > { + let Predicates = [isCIOnly]; + } } -multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. 
Offset loaded in an 32bit SGPR
-  def : Pat <
-    (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
-    (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
-  >;
-
-  // 3. No offset at all
-  def : Pat <
-    (constant_load i64:$sbase),
-    (vt (Instr_IMM $sbase, 0))
-  >;
-}
-
-let Predicates = [isSICI] in {
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isSICI]
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {

-let Predicates = [isVI] in {
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isVI]
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;

-let Predicates = [isSICI] in {
+// 1. Offset as an immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset)
+>;

-// 1. Offset as 8bit DWORD immediate
+// 2. Offset loaded in a 32-bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset)
 >;

-} // End Predicates = [isSICI]
+let Predicates = [isCI] in {

-// 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, imm:$offset),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset)
 >;

+} // End Predicates = [isCI]
+
+} // End let AddedComplexity = 100
+
 //===----------------------------------------------------------------------===//
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//

@@ -2161,6 +2151,11 @@ def : Pat <
     (S_MOV_B32 0), sub1))
 >;

+def : Pat <
+  (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+  (S_ABS_I32 $x)
+>;
+
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
@@ -2488,6 +2483,11 @@ def : Pat <
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/

+//def : Extract_Element<i64, v2i64, 0, sub0_sub1>;
+//def : Extract_Element<i64, v2i64, 1, sub2_sub3>;
+//def : Extract_Element<f64, v2f64, 0, sub0_sub1>;
+//def : Extract_Element<f64, v2f64, 1, sub2_sub3>;
+
 foreach Index = 0-2 in {
   def Extract_Element_v2i32_#Index : Extract_Element <
     i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2568,11 +2568,25 @@ def : BitConvert <v2i32, i64, VReg_64>;
 def : BitConvert <i64, v2i32, VReg_64>;
 def : BitConvert <v2f32, i64, VReg_64>;
 def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
 def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2f32, VReg_64>;
 def : BitConvert <f64, v2i32, VReg_64>;

 def : BitConvert <v4f32, v4i32, VReg_128>;
 def : BitConvert <v4i32, v4f32, VReg_128>;
+
+def : BitConvert <v2i64, v4i32, SReg_128>;
+def : BitConvert <v4i32, v2i64, SReg_128>;
+
+def : BitConvert <v2f64, v4f32, VReg_128>;
+def : BitConvert <v2f64, v4i32, VReg_128>;
+def : BitConvert <v4f32, v2f64, VReg_128>;
+def : BitConvert <v4i32, v2f64, VReg_128>;
+
+
+
+
 def : BitConvert <v8f32, v8i32, SReg_256>;
 def : BitConvert <v8i32, v8f32, SReg_256>;
 def : BitConvert <v8i32, v32i8, SReg_256>;
@@ -2601,10 +2615,9 @@ def : Pat <

 // Prevent expanding both fneg and fabs.

-// FIXME: Should use S_OR_B32
 def : Pat <
   (fneg (fabs f32:$src)),
-  (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
+  (S_OR_B32 $src, 0x80000000) /* Set sign bit */
 >;

 // FIXME: Should use S_OR_B32
@@ -2836,10 +2849,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
 // -1. For the non-rtn variants, the manual says it does
 // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
 // will always do the increment so I'm assuming it's the same.
-//
-// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
-// needs to be a VGPR. The SGPR copy pass will fix this, and it's
-// easier since there is no v_mov_b64.
 class DSAtomicIncRetPat<DS inst, ValueType vt,
                         Instruction LoadImm, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
@@ -2855,9 +2864,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <

 // 32-bit atomics.
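Why the all-ones sentinel below works: a behavioral sketch of the DS_INC rule quoted in the comment above (standalone C++, assuming unsigned 32-bit lanes; illustration only):

    #include <cstdint>
    // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1; with D0 = 0xffffffff the compare
    // can only trip at the wrap point, so the update degenerates to an
    // unconditional atomic add of 1.
    uint32_t dsIncSemantics(uint32_t Mem, uint32_t D0) {
      return (Mem >= D0) ? 0 : Mem + 1;
    }

The patterns below therefore materialize -1 with V_MOV_B32_e32 / V_MOV_B64_PSEUDO, since DS data operands must live in VGPRs; the old S_MOV form depended on the SGPR copy-fixup pass to repair it afterwards.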
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_add_local>; + V_MOV_B32_e32, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_sub_local>; + V_MOV_B32_e32, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; @@ -2874,9 +2883,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_add_local>; + V_MOV_B64_PSEUDO, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_sub_local>; + V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; @@ -3019,90 +3028,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - /********** ====================== **********/ /********** Indirect adressing **********/ /********** ====================== **********/ -multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> { +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { // 1. Extract with offset def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) + (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) + (eltvt (extractelt vt:$vec, i32:$idx)), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) >; // 3. Insert with offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) + (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) >; // 4. 
Insert without offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) + (insertelt vt:$vec, eltvt:$val, i32:$idx), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) >; } -defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; +defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; -defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; +defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; //===----------------------------------------------------------------------===// // Conversion Patterns @@ -3215,12 +3180,12 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), (EXTRACT_SUBREG $a, sub0)), 1) >; @@ -3301,24 +3266,6 @@ def : Pat < } // End Predicates = [isSI] -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index c319b32..126f624 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,6 +103,10 @@ public: return "SI Lower control flow instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { DebugLoc DL = From.getDebugLoc(); BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); + .addOperand(To); } void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { @@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { // If the exec mask is non-zero, skip the next two instructions BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); + .addImm(3); // Exec mask is zero: Export to NULL target... 
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) @@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { .addReg(Src); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); + .addOperand(MI.getOperand(1)); MI.eraseFromParent(); } @@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { .addImm(0); } } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) .addImm(0) .addOperand(Op); } @@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int .addReg(AMDGPU::VCC_LO); // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(Idx); // Update EXEC, save the original EXEC value to VCC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) @@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + .addImm(-7); // Restore EXEC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Vec, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) .addReg(Reg, RegState::Define) .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Dst, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + if (TII->isWQM(MI) || TII->isDS(MI)) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) + if (TII->isFLAT(MI)) NeedFlat = true; switch (MI.getOpcode()) { @@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Branch(MI); break; - case AMDGPU::SI_INDIRECT_SRC: + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: IndirectSrc(MI); break; diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 67421e2..a2fa5fd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -48,6 +48,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 587ea63..935aad4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,10 +29,114 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), + ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), - LDSWaveSpillSize(0) { } + NumSystemSGPRs(0), + HasSpilledSGPRs(false), + HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), + DispatchPtr(false), + QueuePtr(false), + DispatchID(false), + KernargSegmentPtr(false), + FlatScratchInit(false), + GridWorkgroupCountX(false), + GridWorkgroupCountY(false), + GridWorkgroupCountZ(false), + WorkGroupIDX(true), + WorkGroupIDY(false), + WorkGroupIDZ(false), + WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), + WorkItemIDX(true), + WorkItemIDY(false), + WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const Function *F = MF.getFunction(); + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = 
true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; +} SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, @@ -53,7 +157,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifer // complaining about use of an undefined physical register. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 667da4c..9c528d6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,13 +26,83 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; - bool HasSpilledVGPRs; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. + unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. 
+ unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: + // FIXME: Make private + unsigned LDSWaveSpillSize; + unsigned PSInputAddr; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned ScratchOffsetReg; + unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; + +private: + bool HasSpilledSGPRs; + bool HasSpilledVGPRs; + + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; + bool DispatchPtr : 1; + bool QueuePtr : 1; + bool DispatchID : 1; + bool KernargSegmentPtr : 1; + bool FlatScratchInit : 1; + bool GridWorkgroupCountX : 1; + bool GridWorkgroupCountY : 1; + bool GridWorkgroupCountZ : 1; + + // Feature bits required for inputs passed in system SGPRs. + bool WorkGroupIDX : 1; // Always initialized. + bool WorkGroupIDY : 1; + bool WorkGroupIDZ : 1; + bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; + + bool WorkItemIDX : 1; // Always initialized. + bool WorkItemIDY : 1; + bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + +public: struct SpilledReg { unsigned VGPR; int Lane; @@ -46,16 +116,162 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map<unsigned, unsigned> LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. 
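These adders hand out SGPRs strictly in ABI order: the user-SGPR adders declared above bump NumUserSGPRs, and the system-SGPR adders that follow below always land after every user SGPR (getNextUserSGPR asserts exactly that). A hedged sketch of a hypothetical driver, using only accessors visible in this patch:

    // Illustration only: request kernel inputs in allocation order.
    void allocateInputSGPRs(SIMachineFunctionInfo &MFI,
                            const SIRegisterInfo &TRI) {
      if (MFI.hasPrivateSegmentBuffer())
        MFI.addPrivateSegmentBuffer(TRI); // consumes 4 user SGPRs
      if (MFI.hasDispatchPtr())
        MFI.addDispatchPtr(TRI);          // consumes 2 user SGPRs
      if (MFI.hasKernargSegmentPtr())
        MFI.addKernargSegmentPtr(TRI);    // consumes 2 user SGPRs
      // System SGPRs may only be added once all user SGPRs are in place.
      if (MFI.hasWorkGroupIDX())
        MFI.addWorkGroupIDX();
      if (MFI.hasPrivateSegmentWaveByteOffset())
        MFI.addPrivateSegmentWaveByteOffset();
    }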
+ unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + + bool hasDispatchPtr() const { + return DispatchPtr; + } + + bool hasQueuePtr() const { + return QueuePtr; + } + + bool hasDispatchID() const { + return DispatchID; + } + + bool hasKernargSegmentPtr() const { + return KernargSegmentPtr; + } + + bool hasFlatScratchInit() const { + return FlatScratchInit; + } + + bool hasGridWorkgroupCountX() const { + return GridWorkgroupCountX; + } + + bool hasGridWorkgroupCountY() const { + return GridWorkgroupCountY; + } + + bool hasGridWorkgroupCountZ() const { + return GridWorkgroupCountZ; + } + + bool hasWorkGroupIDX() const { + return WorkGroupIDX; + } + + bool hasWorkGroupIDY() const { + return WorkGroupIDY; + } + + bool hasWorkGroupIDZ() const { + return WorkGroupIDZ; + } + + bool hasWorkGroupInfo() const { + return WorkGroupInfo; + } + + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + + bool hasWorkItemIDX() const { + return WorkItemIDX; + } + + bool hasWorkItemIDY() const { + return WorkItemIDY; + } + + bool hasWorkItemIDZ() const { + return WorkItemIDZ; + } + + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + /// \brief Returns the physical register reserved for use as the resource + /// descriptor for scratch accesses. 
+ unsigned getScratchRSrcReg() const { + return ScratchRSrcReg; + } + + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } + + bool hasSpilledSGPRs() const { + return HasSpilledSGPRs; + } + + void setHasSpilledSGPRs(bool Spill = true) { + HasSpilledSGPRs = Spill; + } + + bool hasSpilledVGPRs() const { + return HasSpilledVGPRs; + } + + void setHasSpilledVGPRs(bool Spill = true) { + HasSpilledVGPRs = Spill; + } unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index 2cd600d..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,193 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. 
- if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - 
LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index e9e8412..3cdffef 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// - #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -33,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -42,13 +75,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - reserveRegisterTuples(Reserved, AMDGPU::VGPR254); - reserveRegisterTuples(Reserved, AMDGPU::VGPR255); + // Reserve the last 2 registers so we will always have at least 2 more that + // will physically contain VCC. + reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); + + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation + // for VCC/FLAT_SCR. 
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); + reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. - if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). // Assume XNACK_MASK is unused. @@ -60,34 +102,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + unsigned VSLimit = SGPRLimit + VGPRLimit; + for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); unsigned Limit; - if (isSGPRClass(*I)) { + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + Limit = VSLimit; + } else if (isSGPRClass(RC)) { Limit = SGPRLimit / NumSubRegs; } else { Limit = VGPRLimit / NumSubRegs; } - const int *Sets = getRegClassPressureSets(*I); + const int *Sets = getRegClassPressureSets(RC); assert(Sets); for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) + if (Sets[i] == (int)Idx) return Limit; } } @@ -174,17 +239,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } } @@ -228,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -263,16 +331,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // TODO: only do this when it is needed switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states + // ("S_NOP 3") on SI + TII->insertWaitStates(MI, 4); break; case AMDGPUSubtarget::SEA_ISLANDS: break; default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. - TII->insertNOPs(MI, 4); + // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states + // ("S_NOP 4") on VI and later. This also applies to VALUs which write + // VCC, but we're unlikely to see VMEM use VCC. + TII->insertWaitStates(MI, 5); } MI->eraseFromParent(); @@ -322,22 +391,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - static const TargetRegisterClass *BaseClasses[] = { + static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, @@ -359,33 +422,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } +// TODO: It might be helpful to have some target specific flags in +// TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
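Usage note (an assumption-level sketch, not from the patch) ahead of the size-based rewrite of hasVGPRs that follows: callers can combine these helpers to classify any register, virtual or physical.

    // getPhysRegClass and hasVGPRs are the SIRegisterInfo methods in this
    // file; virtual registers carry their class in MachineRegisterInfo.
    static bool isVectorRegister(const SIRegisterInfo &TRI,
                                 const MachineRegisterInfo &MRI, unsigned Reg) {
      const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(Reg)
                                          ? MRI.getRegClass(Reg)
                                          : TRI.getPhysRegClass(Reg);
      return TRI.hasVGPRs(RC);
    }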
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
-  return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+  switch (RC->getSize()) {
+  case 4:
+    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
+  case 8:
+    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
+  case 12:
+    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
+  case 16:
+    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+  case 32:
+    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
+  case 64:
+    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
 }

 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                          const TargetRegisterClass *SRC) const {
-  if (hasVGPRs(SRC)) {
-    return SRC;
-  } else if (SRC == &AMDGPU::SCCRegRegClass) {
-    return &AMDGPU::VCCRegRegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
-    return &AMDGPU::VGPR_32RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
-    return &AMDGPU::VReg_64RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
-    return &AMDGPU::VReg_128RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
-    return &AMDGPU::VReg_256RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
-    return &AMDGPU::VReg_512RegClass;
-  }
-  return nullptr;
+  switch (SRC->getSize()) {
+  case 4:
+    return &AMDGPU::VGPR_32RegClass;
+  case 8:
+    return &AMDGPU::VReg_64RegClass;
+  case 12:
+    return &AMDGPU::VReg_96RegClass;
+  case 16:
+    return &AMDGPU::VReg_128RegClass;
+  case 32:
+    return &AMDGPU::VReg_256RegClass;
+  case 64:
+    return &AMDGPU::VReg_512RegClass;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
 }

 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -402,6 +477,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
   }
 }

+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want to
+  // stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so we
+  // only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  // vreg3 = COPY vreg2, sub0
+  //
+  // We want to look through the COPY to find:
+  //  => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
@@ -462,30 +561,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }

+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_PTR: + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); @@ -496,12 +612,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, // AMDGPU::NoRegister. unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } + for (unsigned Reg : *RC) + if (!MRI.isPhysRegUsed(Reg)) + return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7da6de2..1795237 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,6 +18,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -29,6 +30,15 @@ private: public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. 
+  unsigned reservedPrivateSegmentWaveByteOffsetReg(
+    const MachineFunction &MF) const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;

   unsigned getRegPressureSetLimit(const MachineFunction &MF,
@@ -40,10 +50,6 @@ public:
                            unsigned FIOperandNum,
                            RegScavenger *RS) const override;

-  /// \brief get the register class of the specified type to use in the
-  /// CFGStructurizer
-  const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
-
   unsigned getHWRegIndex(unsigned Reg) const override;

   /// \brief Return the 'base' register class for this register.
@@ -52,23 +58,30 @@ public:

   /// \returns true if this class contains only SGPR registers
   bool isSGPRClass(const TargetRegisterClass *RC) const {
-    if (!RC)
-      return false;
-
     return !hasVGPRs(RC);
   }

   /// \returns true if this class ID contains only SGPR registers
   bool isSGPRClassID(unsigned RCID) const {
-    if (static_cast<int>(RCID) == -1)
-      return false;
-
     return isSGPRClass(getRegClass(RCID));
   }

+  bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return isSGPRClass(MRI.getRegClass(Reg));
+    return isSGPRClass(getPhysRegClass(Reg));
+  }
+
   /// \returns true if this class contains VGPR registers.
   bool hasVGPRs(const TargetRegisterClass *RC) const;

+  /// returns true if this is a pseudoregister class combination of VGPRs and
+  /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on
+  /// them.
+  static bool isPseudoRegClass(const TargetRegisterClass *RC) {
+    return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass;
+  }
+
   /// \returns A VGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentVGPRClass(
                                           const TargetRegisterClass *SRC) const;
@@ -79,6 +92,11 @@ public:
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;

+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
   /// \p Channel This is the register channel (e.g. a value from 0-16), not the
   /// SubReg index.
   /// \returns The sub-register of Reg that is in Channel.
@@ -91,19 +109,25 @@ public:

   /// \returns True if operands defined with this operand type can accept
   /// an inline constant. i.e. An integer value in the range (-16, 64) or
-  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
+  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
   bool opCanUseInlineConstant(unsigned OpType) const;

   enum PreloadedValue {
-    TGID_X,
-    TGID_Y,
-    TGID_Z,
-    SCRATCH_WAVE_OFFSET,
-    SCRATCH_PTR,
-    INPUT_PTR,
-    TIDIG_X,
-    TIDIG_Y,
-    TIDIG_Z
+    // SGPRS:
+    PRIVATE_SEGMENT_BUFFER = 0,
+    DISPATCH_PTR = 1,
+    QUEUE_PTR = 2,
+    KERNARG_SEGMENT_PTR = 3,
+    WORKGROUP_ID_X = 10,
+    WORKGROUP_ID_Y = 11,
+    WORKGROUP_ID_Z = 12,
+    PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
+    // VGPRS:
+    FIRST_VGPR_VALUE = 15,
+    WORKITEM_ID_X = FIRST_VGPR_VALUE,
+    WORKITEM_ID_Y = 16,
+    WORKITEM_ID_Z = 17
   };

   /// \brief Returns the physical register that \p Value is stored in.
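For reference, a short usage sketch of getPreloadedValue with the renamed PreloadedValue entries (hypothetical caller; it relies only on interfaces visible in this patch):

    // The rewritten getPreloadedValue asserts that the requested input was
    // actually reserved, so consult the SIMachineFunctionInfo bit first.
    static unsigned getWorkGroupIDXReg(const MachineFunction &MF,
                                       const SIRegisterInfo &TRI) {
      const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      assert(MFI->hasWorkGroupIDX() && "workgroup ID X input not reserved");
      return TRI.getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X);
    }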
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2a9017f..bfaf937 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -10,10 +10,13 @@ //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// - -class SIReg <string n, bits<16> encoding = 0> : Register<n> { +class SIReg <string n, bits<16> regIdx = 0> : Register<n>, + DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let HWEncoding = encoding; + + // This is the not yet the complete register encoding. An additional + // bit is set for VGPRs. + let HWEncoding = regIdx; } // Special Registers @@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, + DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, + DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. +multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { + def _ci : SIReg<n, ci_e>; + def _vi : SIReg<n, vi_e>; + def "" : SIReg<"", 0>; +} -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { +class FlatReg <Register lo, Register hi, bits<16> encoding> : + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = encoding; } +defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes. +defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes. + +def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>; +def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; +def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; + // SGPR registers -foreach Index = 0-101 in { +foreach Index = 0-103 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; } @@ -65,25 +81,27 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +// TODO: Do we need to set DwarfRegAlias on register tuples? 
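A note on the RegisterTuples syntax in the groupings below: (decimate SGPR_32, 2) keeps every second register starting at SGPR0, and (decimate (shl SGPR_32, 1), 2) every second starting at SGPR1, so the zipped sub0/sub1 operand lists form aligned even/odd pairs. With the range widened to SGPR0-103, SGPR_64Regs enumerates s[0:1] through s[102:103] (standalone sketch, illustration only):

    #include <cstdio>
    // Prints the 52 even-aligned SGPR pairs covered by SGPR_64Regs.
    int main() {
      for (unsigned Lo = 0; Lo <= 102; Lo += 2)
        std::printf("s[%u:%u]\n", Lo, Lo + 1);
      return 0;
    }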
+
 // SGPR 32-bit registers
 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-                            (add (sequence "SGPR%u", 0, 101))>;
+                            (add (sequence "SGPR%u", 0, 103))>;
 
 // SGPR 64-bit registers
 def SGPR_64Regs : RegisterTuples<[sub0, sub1],
-                             [(add (decimate (trunc SGPR_32, 101), 2)),
+                             [(add (decimate SGPR_32, 2)),
                               (add (decimate (shl SGPR_32, 1), 2))]>;
 
 // SGPR 128-bit registers
 def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
-                              [(add (decimate (trunc SGPR_32, 99), 4)),
+                              [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4))]>;
 
 // SGPR 256-bit registers
 def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
-                              [(add (decimate (trunc SGPR_32, 95), 4)),
+                              [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4)),
@@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
 // SGPR 512-bit registers
 def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
                                sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
-                              [(add (decimate (trunc SGPR_32, 87), 4)),
+                              [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4)),
@@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass {
   let RenderMethod = "addRegOrImmOperands";
 }
 
-// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
-  let CopyCost = -1; // Theoretically it is possible to read from SCC,
-                     // but it should never be necessary.
-}
-
-def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
-def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
-
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
   (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
 >;
 
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
 
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
-  (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+  (add SGPR_64, VCC, EXEC, FLAT_SCR)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> {
+  // Requires 2 s_mov_b64 to copy
+  let CopyCost = 2;
+}
 
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> {
+  // Requires 4 s_mov_b64 to copy
+  let CopyCost = 4;
+}
 
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+  // Requires 8 s_mov_b64 to copy
+  let CopyCost = 8;
+}
 
 // Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+  // Requires 2 v_mov_b32 to copy
+  let CopyCost = 2;
+}
 
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
   let Size = 96;
+
+  // Requires 3 v_mov_b32 to copy
+  let CopyCost = 3;
 }
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+  // Requires 4 v_mov_b32 to copy
+  let CopyCost = 4;
+}
 
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> {
+  let CopyCost = 8;
+}
 
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+  let CopyCost = 16;
+}
 
 def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
   let Size = 32;
@@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> {
 
 def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
 
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+  let CopyCost = 2;
+}
 
 def VSrc_32 : RegisterOperand<VS_32> {
   let OperandNamespace = "AMDGPU";
@@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> {
   let OperandType = "OPERAND_REG_INLINE_C";
   let ParserMatchClass = RegImmMatcher<"VCSrc64">;
 }
+
+//===----------------------------------------------------------------------===//
+// SCSrc_* Operands with an SGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+def SCSrc_64 : RegisterOperand<SReg_64> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_REG_INLINE_C";
+  let ParserMatchClass = RegImmMatcher<"SCSrc64">;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
index 9b1f676..cd77e51 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -17,16 +17,28 @@ def WriteLDS : SchedWrite;
 def WriteSALU : SchedWrite;
 def WriteSMEM : SchedWrite;
 def WriteVMEM : SchedWrite;
+def WriteBarrier : SchedWrite;
 
 // Vector ALU instructions
 def Write32Bit : SchedWrite;
 def WriteQuarterRate32 : SchedWrite;
+def WriteFullOrQuarterRate32 : SchedWrite;
 
 def WriteFloatFMA : SchedWrite;
 
-def WriteDouble : SchedWrite;
+// Slow quarter rate f64 instruction.
+def WriteDouble : SchedWrite;
+
+// half rate f64 instruction (same as v_add_f64)
 def WriteDoubleAdd : SchedWrite;
 
+// Half rate 64-bit instructions.
+def Write64Bit : SchedWrite;
+
+// FIXME: Should there be a class for instructions which are VALU
+// instructions and have VALU rates, but write to the SALU (i.e. VOPC
+// instructions)
+
 def SIFullSpeedModel : SchedMachineModel;
 def SIQuarterSpeedModel : SchedMachineModel;
 
@@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
 
 // The latency numbers are taken from AMD Accelerated Parallel Processing
-// guide. They may not be acurate.
+// guide. They may not be accurate.
 
 // The latency values are 1 / (operations / cycle) / 4.
 multiclass SICommonWriteRes {
@@ -64,8 +76,10 @@ multiclass SICommonWriteRes {
   def : HWWriteRes<WriteSALU, [HWSALU], 1>;
   def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
   def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+  def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
 
   def : HWVALUWriteRes<Write32Bit, 1>;
+  def : HWVALUWriteRes<Write64Bit, 2>;
   def : HWVALUWriteRes<WriteQuarterRate32, 4>;
 }
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 5d00bdd..4f0913f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   if (!MRI.isSSA())
     return;
 
-  assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
-         TII->isVOPC(MI.getOpcode()));
+  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
 
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
@@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
 
 }
 
+// Copy MachineOperand with all flags except setting it as implicit.
+static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
+  assert(!Orig.isImplicit());
+  return MachineOperand::CreateReg(Orig.getReg(),
+                                   Orig.isDef(),
+                                   true,
+                                   Orig.isKill(),
+                                   Orig.isDead(),
+                                   Orig.isUndef(),
+                                   Orig.isEarlyClobber(),
+                                   Orig.getSubReg(),
+                                   Orig.isDebug(),
+                                   Orig.isInternalRead());
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIInstrInfo *TII =
@@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (TII->isVOPC(Op32)) {
         unsigned DstReg = MI.getOperand(0).getReg();
         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
-          // VOPC instructions can only write to the VCC register. We can't
-          // force them to use VCC here, because the register allocator has
-          // trouble with sequences like this, which cause the allocator to run
-          // out of registers if vreg0 and vreg1 belong to the VCCReg register
-          // class:
-          // vreg0 = VOPC;
-          // vreg1 = VOPC;
-          // S_AND_B64 vreg0, vreg1
+          // VOPC instructions can only write to the VCC register. We can't
+          // force them to use VCC here, because this is only one register and
+          // cannot deal with sequences which would require multiple copies of
+          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
           //
           // So, instead of forcing the instruction to write to VCC, we provide
           // a hint to the register allocator to use VCC and then we will run
@@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       }
 
       // We can shrink this instruction
-      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
+      DEBUG(dbgs() << "Shrinking " << MI);
 
       MachineInstrBuilder Inst32 =
           BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
 
-      // dst
-      Inst32.addOperand(MI.getOperand(0));
+      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+      // For VOPC instructions, this is replaced by an implicit def of vcc.
+      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+      if (Op32DstIdx != -1) {
+        // dst
+        Inst32.addOperand(MI.getOperand(0));
+      } else {
+        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+               "Unexpected case");
+      }
+
       Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
 
@@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         Inst32.addOperand(*Src1);
 
       const MachineOperand *Src2 =
-          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-      if (Src2)
-        Inst32.addOperand(*Src2);
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2) {
+        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+        if (Op32Src2Idx != -1) {
+          Inst32.addOperand(*Src2);
+        } else {
+          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+          // replaced with an implicit read of vcc.
+          assert(Src2->getReg() == AMDGPU::VCC &&
+                 "Unexpected missing register operand");
+          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+        }
+      }
 
       ++NumInstructionsShrunk;
       MI.eraseFromParent();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
index 591ce85..dbdc76b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 
@@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
 }
 
 bool SITypeRewriter::runOnFunction(Function &F) {
-  Attribute A = F.getFnAttribute("ShaderType");
-
-  unsigned ShaderType = ShaderType::COMPUTE;
-  if (A.isStringAttribute()) {
-    StringRef Str = A.getValueAsString();
-    Str.getAsInteger(0, ShaderType);
-  }
-  if (ShaderType == ShaderType::COMPUTE)
+  if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
     return false;
 
   visit(F);
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b76b400..add415e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -7,12 +7,23 @@
 //
 //===----------------------------------------------------------------------===//
 #include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/SubtargetFeature.h"
 
 #define GET_SUBTARGETINFO_ENUM
 #include "AMDGPUGenSubtargetInfo.inc"
 #undef GET_SUBTARGETINFO_ENUM
 
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
 namespace llvm {
 namespace AMDGPU {
 
@@ -56,5 +67,91 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
   Header.private_segment_alignment = 4;
 }
 
+MCSection *getHSATextSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_EXECINSTR |
+                           ELF::SHF_AMDGPU_HSA_AGENT |
+                           ELF::SHF_AMDGPU_HSA_CODE);
+}
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_AMDGPU_HSA_GLOBAL |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+bool isGroupSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
+bool isGlobalSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+static const char ShaderTypeAttribute[] = "ShaderType";
+
+unsigned getShaderType(const Function &F) {
+  Attribute A = F.getFnAttribute(ShaderTypeAttribute);
+  unsigned ShaderType = ShaderType::COMPUTE;
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, ShaderType)) {
+      LLVMContext &Ctx = F.getContext();
+      Ctx.emitError("can't parse shader type");
+    }
+  }
+  return ShaderType;
+}
+
+bool isSI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+}
+
+bool isCI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
+}
+
+bool isVI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+}
+
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+
+  switch(Reg) {
+  default: break;
+  case AMDGPU::FLAT_SCR:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+
+  case AMDGPU::FLAT_SCR_LO:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+
+  case AMDGPU::FLAT_SCR_HI:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
+  }
+  return Reg;
+}
+
 } // End namespace AMDGPU
 } // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f57028c..19419a2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -15,6 +15,11 @@
 namespace llvm {
 
 class FeatureBitset;
+class Function;
+class GlobalValue;
+class MCContext;
+class MCSection;
+class MCSubtargetInfo;
 
 namespace AMDGPU {
 
@@ -27,6 +32,27 @@ struct IsaVersion {
 IsaVersion getIsaVersion(const FeatureBitset &Features);
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const FeatureBitset &Features);
+MCSection *getHSATextSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
+
+unsigned getShaderType(const Function &F);
+
+bool isSI(const MCSubtargetInfo &STI);
+bool isCI(const MCSubtargetInfo &STI);
+bool isVI(const MCSubtargetInfo &STI);
+
+/// If \p Reg is a pseudo reg, return the correct hardware register given
+/// \p STI otherwise return \p Reg.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
 
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
index aca4673..20a026a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
 } // End isCommutable = 1
 
 defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
 
-// Aliases to simplify matching of floating-pint instructions that are VOP2 on
-// SI and VOP3 on VI.
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
 
 class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
   name#" $dst, $src0, $src1",
@@ -89,60 +89,15 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
 def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
 def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
 
-} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-
 //===----------------------------------------------------------------------===//
-// SMEM Patterns
+// SMEM Instructions
 //===----------------------------------------------------------------------===//
 
-let Predicates = [isVI] in {
+def S_DCACHE_WB : SMEM_Inval <0x21,
+  "s_dcache_wb", int_amdgcn_s_dcache_wb>;
 
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
-  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-// Patterns for global loads with no offset
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr)),
-  (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (node vt:$data, i64:$addr),
-  (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr, vt:$data)),
-  (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
+  "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
 
-} // End Predicates = [isVI]
+} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
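The getMCReg() helper added to AMDGPUBaseInfo above is what lets the rest of the backend keep using the single FLAT_SCR pseudo register while the MC layer substitutes the generation-specific hardware register (encoding 104 on CI, 102 on VI, per the SIRegisterInfo.td change earlier in this diff). An illustrative sketch of a caller, assuming only the declarations visible in the diff; the wrapper function itself is hypothetical:

    #include "Utils/AMDGPUBaseInfo.h"
    #include "llvm/MC/MCSubtargetInfo.h"

    using namespace llvm;

    // Resolve the FLAT_SCR pseudo register to the hardware register for the
    // current subtarget: FLAT_SCR_ci on Sea Islands, FLAT_SCR_vi on Volcanic
    // Islands. Southern Islands has no flat scratch register, so calling this
    // on SI would trip the assert inside getMCReg().
    static unsigned resolveFlatScratch(const MCSubtargetInfo &STI) {
      return AMDGPU::getMCReg(AMDGPU::FLAT_SCR, STI);
    }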