Diffstat (limited to 'contrib/llvm/lib/Target/R600')
77 files changed, 5267 insertions, 2753 deletions
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.h b/contrib/llvm/lib/Target/R600/AMDGPU.h
index c660055..0a05d25 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPU.h
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.h
@@ -43,6 +43,7 @@ FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
@@ -64,9 +65,8 @@ Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
 
-/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
-ImmutablePass *
-createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
+void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
+extern char &SIFixControlFlowLiveIntervalsID;
 
 void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
 extern char &SIFixSGPRLiveRangesID;
@@ -105,7 +105,7 @@ namespace ShaderType {
 /// a separate piece of memory that is unique from other
 /// memory locations.
 namespace AMDGPUAS {
-enum AddressSpaces {
+enum AddressSpaces : unsigned {
   PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
   GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
   CONSTANT_ADDRESS = 2, ///< Address space for constant memory
@@ -137,7 +137,10 @@ enum AddressSpaces {
   CONSTANT_BUFFER_14 = 22,
   CONSTANT_BUFFER_15 = 23,
   ADDRESS_NONE = 24, ///< Address space for unknown memory.
-  LAST_ADDRESS = ADDRESS_NONE
+  LAST_ADDRESS = ADDRESS_NONE,
+
+  // Some places use this if the address space can't be determined.
+  UNKNOWN_ADDRESS_SPACE = ~0u
 };
 
 } // namespace AMDGPUAS
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.td b/contrib/llvm/lib/Target/R600/AMDGPU.td
index 03f2bbe..2e7e39a 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPU.td
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.td
@@ -1,11 +1,11 @@
-//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
+//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//==-----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 
 include "llvm/Target/Target.td"
 
@@ -20,6 +20,11 @@ def FeatureDumpCode : SubtargetFeature <"DumpCode",
         "true",
         "Dump MachineInstrs in the CodeEmitter">;
 
+def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
+        "DumpCode",
+        "true",
+        "Dump MachineInstrs in the CodeEmitter">;
+
 def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
         "EnableIRStructurizer",
         "false",
@@ -48,6 +53,12 @@ def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
         "Enable double precision denormal handling",
         [FeatureFP64]>;
 
+def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
+        "FastFMAF32",
+        "true",
+        "Assuming f32 fma is at least as fast as mul + add",
+        []>;
+
 // Some instructions do not support denormals despite this flag. Using
 // fp32 denormals also causes instructions to run at the double
 // precision rate for the device.
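A note on the AMDGPU.h hunk above: giving AddressSpaces an explicit unsigned underlying type pins down the type and signedness of the enumerators, so the new UNKNOWN_ADDRESS_SPACE = ~0u sentinel has one well-defined representation instead of an implementation-chosen one. A minimal standalone sketch of the idiom (the addressSpaceName helper is illustrative, not part of the change):

#include <cstdint>

enum AddressSpaces : unsigned {
  PRIVATE_ADDRESS = 0,
  GLOBAL_ADDRESS = 1,
  CONSTANT_ADDRESS = 2,
  // Out-of-band marker; with ": unsigned" its value and type are explicit.
  UNKNOWN_ADDRESS_SPACE = ~0u
};

// Hypothetical consumer: code that can't classify a pointer reports the
// sentinel instead of guessing a real address space.
const char *addressSpaceName(unsigned AS) {
  switch (AS) {
  case PRIVATE_ADDRESS:       return "private";
  case GLOBAL_ADDRESS:        return "global";
  case CONSTANT_ADDRESS:      return "constant";
  case UNKNOWN_ADDRESS_SPACE: return "unknown";
  default:                    return "other";
  }
}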
@@ -121,12 +132,47 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
 def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
 def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
 
+class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
+      "ldsbankcount"#Value,
+      "LDSBankCount",
+      !cast<string>(Value),
+      "The number of LDS banks per compute unit.">;
+
+def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
+def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
+
 class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
         "localmemorysize"#Value,
         "LocalMemorySize",
         !cast<string>(Value),
         "The size of local memory in bytes">;
 
+def FeatureGCN : SubtargetFeature<"gcn",
+        "IsGCN",
+        "true",
+        "GCN or newer GPU">;
+
+def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
+        "GCN1Encoding",
+        "true",
+        "Encoding format for SI and CI">;
+
+def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
+        "GCN3Encoding",
+        "true",
+        "Encoding format for VI">;
+
+def FeatureCIInsts : SubtargetFeature<"ci-insts",
+        "CIInsts",
+        "true",
+        "Additional intstructions for CI+">;
+
+// Dummy feature used to disable assembler instructions.
+def FeatureDisable : SubtargetFeature<"",
+        "FeatureDisable","true",
+        "Dummy feature to disable assembler"
+        " instructions">;
+
 class SubtargetFeatureGeneration <string Value,
                                   list<SubtargetFeature> Implies> :
         SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
@@ -152,20 +198,24 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
 
 def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
-        FeatureWavefrontSize64]>;
+        FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+        FeatureLDSBankCount32]>;
 
 def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
-        FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+        FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
+        FeatureGCN1Encoding, FeatureCIInsts]>;
 
 def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
-        FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+        FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+        FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
 
 //===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
+  let noNamedPositionallyEncodedOperands = 1;
 }
 
 def AMDGPUAsmParser : AsmParser {
@@ -188,10 +238,20 @@ def NullALU : InstrItinClass;
 // Predicate helper class
 //===----------------------------------------------------------------------===//
 
+def TruePredicate : Predicate<"true">;
+def isSICI : Predicate<
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>, AssemblerPredicate<"FeatureGCN1Encoding">;
+
 class PredicateControl {
   Predicate SubtargetPredicate;
+  Predicate SIAssemblerPredicate = isSICI;
+  list<Predicate> AssemblerPredicates = [];
+  Predicate AssemblerPredicate = TruePredicate;
   list<Predicate> OtherPredicates = [];
-  list<Predicate> Predicates = !listconcat([SubtargetPredicate],
+  list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate],
+                                           AssemblerPredicates,
                                            OtherPredicates);
 }
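The isSICI predicate above works because the subtarget generations form an ordered scale, which is also why the C++ hunks later in this diff can test getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS. A rough standalone sketch of that convention, assuming the usual generation ordering (the helper names are illustrative, not LLVM API):

// Generations increase monotonically, so "SI or CI" and "CI or newer"
// reduce to simple comparisons rather than feature-by-feature tests.
enum Generation : unsigned {
  R600 = 0,
  R700,
  EVERGREEN,
  NORTHERN_ISLANDS,
  SOUTHERN_ISLANDS,
  SEA_ISLANDS,
  VOLCANIC_ISLANDS
};

// Same shape as the isSICI assembler predicate: SI and CI share the
// "GCN1" encoding; VI moves to the "GCN3" encoding.
bool usesGCN1Encoding(Generation Gen) {
  return Gen == SOUTHERN_ISLANDS || Gen == SEA_ISLANDS;
}

bool hasCIInsts(Generation Gen) {
  return Gen >= SEA_ISLANDS; // matches the FeatureCIInsts "CI+" wording
}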
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 1fae26e..56b50a9 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -17,6 +17,7 @@
 //
 
 #include "AMDGPUAsmPrinter.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
 #include "AMDGPU.h"
 #include "AMDKernelCodeT.h"
 #include "AMDGPUSubtarget.h"
@@ -58,7 +59,7 @@ using namespace llvm;
 // instructions to run at the double precision rate for the device so it's
 // probably best to just report no single precision denormals.
 static uint32_t getFPMode(const MachineFunction &F) {
-  const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
   // TODO: Is there any real use for the flush in only / flush out only modes?
 
   uint32_t FP32Denormals =
@@ -73,9 +74,10 @@ static uint32_t getFPMode(const MachineFunction &F) {
          FP_DENORM_MODE_DP(FP64Denormals);
 }
 
-static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
-                                              MCStreamer &Streamer) {
-  return new AMDGPUAsmPrinter(tm, Streamer);
+static AsmPrinter *
+createAMDGPUAsmPrinterPass(TargetMachine &tm,
+                           std::unique_ptr<MCStreamer> &&Streamer) {
+  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
 }
 
 extern "C" void LLVMInitializeR600AsmPrinter() {
@@ -83,19 +85,18 @@ extern "C" void LLVMInitializeR600AsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
 }
 
-AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {
-  DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
-}
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
+                                   std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer)) {}
 
 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
 
   // This label is used to mark the end of the .text section.
   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-  OutStreamer.SwitchSection(TLOF.getTextSection());
+  OutStreamer->SwitchSection(TLOF.getTextSection());
   MCSymbol *EndOfTextLabel =
-      OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-  OutStreamer.EmitLabel(EndOfTextLabel);
+      OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+  OutStreamer->EmitLabel(EndOfTextLabel);
 }
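The constructor and factory rewrites above track an LLVM-wide change of this era: the AsmPrinter takes ownership of its MCStreamer, so the streamer is passed through as std::unique_ptr and every use becomes OutStreamer-> rather than OutStreamer.. A compact sketch of the ownership-transfer pattern under those assumptions (the types here are stand-ins, not LLVM API):

#include <memory>
#include <utility>

struct Streamer { /* stand-in for MCStreamer */ };

class Printer {
  std::unique_ptr<Streamer> Out; // owned by the printer, was a reference
public:
  explicit Printer(std::unique_ptr<Streamer> S) : Out(std::move(S)) {}
};

// Mirrors createAMDGPUAsmPrinterPass: accept by rvalue reference and
// forward with std::move so the Printer ends up owning the streamer.
static Printer *createPrinter(std::unique_ptr<Streamer> &&S) {
  return new Printer(std::move(S));
}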
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -105,20 +106,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   SetupMachineFunction(MF);
 
-  EmitFunctionHeader();
-
   MCContext &Context = getObjFileLowering().getContext();
-  const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
-                                                            ELF::SHT_PROGBITS, 0,
-                                                            SectionKind::getReadOnly());
-  OutStreamer.SwitchSection(ConfigSection);
+  MCSectionELF *ConfigSection =
+      Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+  OutStreamer->SwitchSection(ConfigSection);
 
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
   if (STM.isAmdHsaOS()) {
     getSIProgramInfo(KernelInfo, MF);
     EmitAmdKernelCodeT(MF, KernelInfo);
-    OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1));
+    OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1));
   } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     getSIProgramInfo(KernelInfo, MF);
     EmitProgramInfoSI(MF, KernelInfo);
@@ -130,49 +128,45 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   HexLines.clear();
   DisasmLineMaxLen = 0;
 
-  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
   EmitFunctionBody();
 
   if (isVerbose()) {
-    const MCSectionELF *CommentSection
-      = Context.getELFSection(".AMDGPU.csdata",
-                              ELF::SHT_PROGBITS, 0,
-                              SectionKind::getReadOnly());
-    OutStreamer.SwitchSection(CommentSection);
+    MCSectionELF *CommentSection =
+        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+    OutStreamer->SwitchSection(CommentSection);
 
     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-      OutStreamer.emitRawComment(" Kernel info:", false);
-      OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
-                                 false);
-      OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
-                                 false);
-      OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
-                                 false);
-      OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
-                                 false);
-      OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
-                                 false);
-      OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
-                                 false);
+      OutStreamer->emitRawComment(" Kernel info:", false);
+      OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+                                  false);
+      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
+                                  false);
+      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
+                                  false);
+      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+                                  false);
+      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+                                  false);
+      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+                                  false);
     } else {
       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-      OutStreamer.emitRawComment(
+      OutStreamer->emitRawComment(
         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
     }
   }
 
-  if (STM.dumpCode() && DisasmEnabled) {
+  if (STM.dumpCode()) {
 
-    OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
-                                                    ELF::SHT_NOTE, 0,
-                                                    SectionKind::getReadOnly()));
+    OutStreamer->SwitchSection(
+        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
 
     for (size_t i = 0; i < DisasmLines.size(); ++i) {
       std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
       Comment += " ; " + HexLines[i] + "\n";
 
-      OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
-      OutStreamer.EmitBytes(StringRef(Comment));
+      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
+      OutStreamer->EmitBytes(StringRef(Comment));
     }
   }
 
@@ -182,10 +176,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
   unsigned MaxGPR = 0;
   bool killPixel = false;
-  const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+  const R600RegisterInfo *RI =
+      static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
@@ -227,29 +221,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
     }
   }
 
-  OutStreamer.EmitIntValue(RsrcReg, 4);
-  OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+  OutStreamer->EmitIntValue(RsrcReg, 4);
+  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
                            S_STACK_SIZE(MFI->StackSize), 4);
-  OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
-  OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 
   if (MFI->getShaderType() == ShaderType::COMPUTE) {
-    OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
-    OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+    OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
   }
 }
 
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                         const MachineFunction &MF) const {
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   uint64_t CodeSize = 0;
   unsigned MaxSGPR = 0;
   unsigned MaxVGPR = 0;
   bool VCCUsed = false;
   bool FlatUsed = false;
-  const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const SIRegisterInfo *RI =
+      static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
@@ -427,45 +421,45 @@ static unsigned getRsrcReg(unsigned ShaderType) {
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &KernelInfo) {
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
 
   if (MFI->getShaderType() == ShaderType::COMPUTE) {
-    OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 
-    OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
 
-    OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
-    OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
+    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
 
-    OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
-    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
+    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
   } else {
-    OutStreamer.EmitIntValue(RsrcReg, 4);
-    OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
-                             S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+    OutStreamer->EmitIntValue(RsrcReg, 4);
+    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
     if (STM.isVGPRSpillingEnabled(MFI)) {
-      OutStreamer.EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
-      OutStreamer.EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
     }
   }
 
   if (MFI->getShaderType() == ShaderType::PIXEL) {
-    OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
-    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
-    OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
-    OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
+    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+    OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
   }
 }
 
 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
                                          const SIProgramInfo &KernelInfo) const {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   amd_kernel_code_t header;
 
   memset(&header, 0, sizeof(header));
@@ -515,69 +509,92 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
 
   header.wavefront_size = STM.getWavefrontSize();
 
-  const MCSectionELF *VersionSection = OutContext.getELFSection(".hsa.version",
-      ELF::SHT_PROGBITS, 0, SectionKind::getReadOnly());
-  OutStreamer.SwitchSection(VersionSection);
-  OutStreamer.EmitBytes(Twine("HSA Code Unit:" +
-                        Twine(header.hsail_version_major) + "." +
-                        Twine(header.hsail_version_minor) + ":" +
-                        "AMD:" +
-                        Twine(header.amd_code_version_major) + "." +
-                        Twine(header.amd_code_version_minor) + ":" +
-                        "GFX8.1:0").str());
+  MCSectionELF *VersionSection =
+      OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0);
+  OutStreamer->SwitchSection(VersionSection);
+  OutStreamer->EmitBytes(Twine("HSA Code Unit:" +
+                         Twine(header.hsail_version_major) + "." +
+                         Twine(header.hsail_version_minor) + ":" +
+                         "AMD:" +
+                         Twine(header.amd_code_version_major) + "." +
+                         Twine(header.amd_code_version_minor) + ":" +
+                         "GFX8.1:0").str());
 
-  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
 
   if (isVerbose()) {
-    OutStreamer.emitRawComment("amd_code_version_major = " +
-                               Twine(header.amd_code_version_major), false);
-    OutStreamer.emitRawComment("amd_code_version_minor = " +
-                               Twine(header.amd_code_version_minor), false);
-    OutStreamer.emitRawComment("struct_byte_size = " +
-                               Twine(header.struct_byte_size), false);
-    OutStreamer.emitRawComment("target_chip = " +
-                               Twine(header.target_chip), false);
-    OutStreamer.emitRawComment(" compute_pgm_rsrc1: " +
-                               Twine::utohexstr(KernelInfo.ComputePGMRSrc1), false);
-    OutStreamer.emitRawComment(" compute_pgm_rsrc2: " +
-                               Twine::utohexstr(KernelInfo.ComputePGMRSrc2), false);
-    OutStreamer.emitRawComment("enable_sgpr_private_segment_buffer = " +
+    OutStreamer->emitRawComment("amd_code_version_major = " +
+                                Twine(header.amd_code_version_major), false);
+    OutStreamer->emitRawComment("amd_code_version_minor = " +
+                                Twine(header.amd_code_version_minor), false);
+    OutStreamer->emitRawComment("struct_byte_size = " +
+                                Twine(header.struct_byte_size), false);
+    OutStreamer->emitRawComment("target_chip = " +
+                                Twine(header.target_chip), false);
+    OutStreamer->emitRawComment(" compute_pgm_rsrc1: " +
+                                Twine::utohexstr(KernelInfo.ComputePGMRSrc1),
+                                false);
+    OutStreamer->emitRawComment(" compute_pgm_rsrc2: " +
+                                Twine::utohexstr(KernelInfo.ComputePGMRSrc2),
+                                false);
+    OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " +
                                Twine((bool)(header.code_properties &
                                             AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false);
-    OutStreamer.emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
+    OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
                                Twine((bool)(header.code_properties &
                                             AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false);
-    OutStreamer.emitRawComment("private_element_size = 2 ", false);
-    OutStreamer.emitRawComment("is_ptr64 = " +
+    OutStreamer->emitRawComment("private_element_size = 2 ", false);
+    OutStreamer->emitRawComment("is_ptr64 = " +
                                Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false);
-    OutStreamer.emitRawComment("workitem_private_segment_byte_size = " +
-                               Twine(header.workitem_private_segment_byte_size),
-                               false);
-    OutStreamer.emitRawComment("workgroup_group_segment_byte_size = " +
-                               Twine(header.workgroup_group_segment_byte_size),
-                               false);
-    OutStreamer.emitRawComment("gds_segment_byte_size = " +
-                               Twine(header.gds_segment_byte_size), false);
-    OutStreamer.emitRawComment("kernarg_segment_byte_size = " +
-                               Twine(header.kernarg_segment_byte_size), false);
-    OutStreamer.emitRawComment("wavefront_sgpr_count = " +
-                               Twine(header.wavefront_sgpr_count), false);
-    OutStreamer.emitRawComment("workitem_vgpr_count = " +
-                               Twine(header.workitem_vgpr_count), false);
-    OutStreamer.emitRawComment("code_type = " + Twine(header.code_type), false);
-    OutStreamer.emitRawComment("wavefront_size = " +
-                               Twine((int)header.wavefront_size), false);
-    OutStreamer.emitRawComment("optimization_level = " +
-                               Twine(header.optimization_level), false);
-    OutStreamer.emitRawComment("hsail_profile = " +
-                               Twine(header.hsail_profile), false);
-    OutStreamer.emitRawComment("hsail_machine_model = " +
-                               Twine(header.hsail_machine_model), false);
-    OutStreamer.emitRawComment("hsail_version_major = " +
-                               Twine(header.hsail_version_major), false);
-    OutStreamer.emitRawComment("hsail_version_minor = " +
-                               Twine(header.hsail_version_minor), false);
+    OutStreamer->emitRawComment("workitem_private_segment_byte_size = " +
+                                Twine(header.workitem_private_segment_byte_size),
+                                false);
+    OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " +
+                                Twine(header.workgroup_group_segment_byte_size),
+                                false);
+    OutStreamer->emitRawComment("gds_segment_byte_size = " +
+                                Twine(header.gds_segment_byte_size), false);
+    OutStreamer->emitRawComment("kernarg_segment_byte_size = " +
+                                Twine(header.kernarg_segment_byte_size), false);
+    OutStreamer->emitRawComment("wavefront_sgpr_count = " +
+                                Twine(header.wavefront_sgpr_count), false);
+    OutStreamer->emitRawComment("workitem_vgpr_count = " +
+                                Twine(header.workitem_vgpr_count), false);
+    OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false);
+    OutStreamer->emitRawComment("wavefront_size = " +
+                                Twine((int)header.wavefront_size), false);
+    OutStreamer->emitRawComment("optimization_level = " +
+                                Twine(header.optimization_level), false);
+    OutStreamer->emitRawComment("hsail_profile = " +
+                                Twine(header.hsail_profile), false);
+    OutStreamer->emitRawComment("hsail_machine_model = " +
+                                Twine(header.hsail_machine_model), false);
+    OutStreamer->emitRawComment("hsail_version_major = " +
+                                Twine(header.hsail_version_major), false);
+    OutStreamer->emitRawComment("hsail_version_minor = " +
+                                Twine(header.hsail_version_minor), false);
+  }
+
+  OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header)));
+}
+
+bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
   }
 
-  OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header)));
+  AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
+                   *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
+  return false;
 }
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
index b360ae8..1acff3a 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -85,7 +85,8 @@ private:
                           const SIProgramInfo &KernelInfo) const;
 
 public:
-  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
+  explicit AMDGPUAsmPrinter(TargetMachine &TM,
+                            std::unique_ptr<MCStreamer> Streamer);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -98,8 +99,11 @@ public:
 
   void EmitEndOfAsmFile(Module &M) override;
 
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+
 protected:
-  bool DisasmEnabled;
   std::vector<std::string> DisasmLines, HexLines;
   size_t DisasmLineMaxLen;
 };
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
index 9e8302e..8175786 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -99,9 +99,8 @@ AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
   NumEntries = 0;
   return nullptr;
 }
-void
-AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
-}
+void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {}
 void
 AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
 }
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
index 15a6636..9f31be1 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
@@ -37,7 +37,7 @@ public:
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
   const SpillSlot *
     getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
-  void emitPrologue(MachineFunction &MF) const override;
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   bool hasFP(const MachineFunction &MF) const override;
 };
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 68d557a..df4461e 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -39,18 +39,17 @@ namespace {
 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
-  const AMDGPUSubtarget &Subtarget;
+  const AMDGPUSubtarget *Subtarget;
 public:
   AMDGPUDAGToDAGISel(TargetMachine &TM);
   virtual ~AMDGPUDAGToDAGISel();
-
+  bool runOnMachineFunction(MachineFunction &MF) override;
   SDNode *Select(SDNode *N) override;
   const char *getPassName() const override;
   void PostprocessISelDAG() override;
 
 private:
   bool isInlineImmediate(SDNode *N) const;
-  inline SDValue getSmallIPtrImm(unsigned Imm);
   bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                    const R600InstrInfo *TII);
   bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
@@ -79,6 +78,8 @@ private:
   bool isLocalLoad(const LoadSDNode *N) const;
   bool isRegionLoad(const LoadSDNode *N) const;
 
+  SDNode *glueCopyToM0(SDNode *N) const;
+
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
   bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
@@ -95,9 +96,10 @@ private:
                          SDValue &Idxen, SDValue &Addr64, SDValue &GLC,
                          SDValue &SLC, SDValue &TFE) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
-                         SDValue &Offset) const;
+                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
+                         SDValue &SLC, SDValue &TFE) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
-                         SDValue &VAddr, SDValue &Offset,
+                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                          SDValue &SLC) const;
   bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                           SDValue &SOffset, SDValue &ImmOffset) const;
@@ -120,6 +122,11 @@ private:
   SDNode *SelectADD_SUB_I64(SDNode *N);
   SDNode *SelectDIV_SCALE(SDNode *N);
 
+  SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
+                   uint32_t Offset, uint32_t Width);
+  SDNode *SelectS_BFEFromShifts(SDNode *N);
+  SDNode *SelectS_BFE(SDNode *N);
+
   // Include the pieces autogenerated from the target description.
 #include "AMDGPUGenDAGISel.inc"
 };
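The class changes above set up the main refactor in this file: the subtarget can now differ per MachineFunction, so rather than binding a Subtarget reference at pass construction, the selector holds a pointer and re-fetches it for every function. A rough sketch of that pattern with stand-in types (not the real LLVM classes):

struct Subtarget { /* per-function target configuration */ };
struct MachineFunction {
  const Subtarget &getSubtarget() const { return ST; }
  Subtarget ST;
};

class ISelPass {
  // Pointer, not reference: rebound at the start of every function so a
  // per-function subtarget switch is picked up.
  const Subtarget *ST = nullptr;

public:
  bool runOnMachineFunction(MachineFunction &MF) {
    ST = &MF.getSubtarget(); // refresh before doing any selection
    return false;            // the real pass then defers to its base class
  }
};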
 
@@ -132,7 +139,11 @@ FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
 }
 
 AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
-    : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
+    : SelectionDAGISel(TM) {}
+
+bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
+  return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
 AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
@@ -156,7 +167,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
   switch (N->getMachineOpcode()) {
   default: {
     const MCInstrDesc &Desc =
-        TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode());
+        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
     unsigned OpIdx = Desc.getNumDefs() + OpNo;
     if (OpIdx >= Desc.getNumOperands())
       return nullptr;
@@ -164,42 +175,38 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
     if (RegClass == -1)
       return nullptr;
 
-    return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass);
+    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
   }
   case AMDGPU::REG_SEQUENCE: {
     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     const TargetRegisterClass *SuperRC =
-        TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID);
+        Subtarget->getRegisterInfo()->getRegClass(RCID);
 
     SDValue SubRegOp = N->getOperand(OpNo + 1);
     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
-    return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg(
-        SuperRC, SubRegIdx);
+    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
+                                                               SubRegIdx);
  }
  }
 }
 
-SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
-  return CurDAG->getTargetConstant(Imm, MVT::i32);
-}
-
 bool AMDGPUDAGToDAGISel::SelectADDRParam(
   SDValue Addr, SDValue& R1, SDValue& R2) {
 
   if (Addr.getOpcode() == ISD::FrameIndex) {
     if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
       R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
     } else {
       R1 = Addr;
-      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
     }
   } else if (Addr.getOpcode() == ISD::ADD) {
     R1 = Addr.getOperand(0);
     R2 = Addr.getOperand(1);
   } else {
     R1 = Addr;
-    R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
   }
   return true;
 }
@@ -222,21 +229,47 @@ bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
   if (Addr.getOpcode() == ISD::FrameIndex) {
     if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
       R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
-      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
     } else {
       R1 = Addr;
-      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
     }
   } else if (Addr.getOpcode() == ISD::ADD) {
     R1 = Addr.getOperand(0);
     R2 = Addr.getOperand(1);
   } else {
     R1 = Addr;
-    R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
   }
   return true;
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+      !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
+                 AMDGPUAS::LOCAL_ADDRESS))
+    return N;
+
+  const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+
+  // Write max value to m0 before each load operation
+
+  SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
+                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+
+  SDValue Glue = M0.getValue(1);
+
+  SmallVector <SDValue, 8> Ops;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+     Ops.push_back(N->getOperand(i));
+  }
+  Ops.push_back(Glue);
+  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
+
+  return N;
+}
+
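glueCopyToM0, just above, does what its comment says: before a local-memory (LDS) operation on GCN parts, it emits a copy of the maximum value into m0, which bounds LDS addressing, and glues that copy to the memory node so selection cannot separate the two. A much-simplified model of the transform under those assumptions (stand-in types, not SelectionDAG API):

#include <vector>

// Stand-in for an SDNode: just an operand list here.
struct Node { std::vector<int> Ops; };

// Mirrors the shape of glueCopyToM0: bail out unless the target is GCN and
// the access is to LDS; otherwise append the glue result of an
// "m0 <- 0xffffffff" copy so the pair stays adjacent.
Node *glueCopyToM0(Node *MemOp, bool IsGCN, bool IsLocal, int M0Glue) {
  if (!IsGCN || !IsLocal)
    return MemOp;               // pre-SI targets and non-LDS ops are untouched
  MemOp->Ops.push_back(M0Glue); // max value in m0 = widest legal LDS window
  return MemOp;
}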
 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -244,7 +277,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return nullptr;   // Already selected.
   }
 
-  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+  if (isa<AtomicSDNode>(N))
+    N = glueCopyToM0(N);
+
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
@@ -253,7 +288,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::ADD:
   case ISD::SUB: {
     if (N->getValueType(0) != MVT::i64 ||
-        ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
 
     return SelectADD_SUB_I64(N);
@@ -262,15 +297,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
-    const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>(
-        TM.getSubtargetImpl()->getRegisterInfo());
-    const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(
-        TM.getSubtargetImpl()->getRegisterInfo());
+    const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
     EVT EltVT = VT.getVectorElementType();
     assert(EltVT.bitsEq(MVT::i32));
-    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
       bool UseVReg = true;
       for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                     U != E; ++U) {
@@ -281,7 +313,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
         if (!RC) {
           continue;
         }
-        if (SIRI->isSGPRClass(RC)) {
+        if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
           UseVReg = false;
         }
       }
@@ -320,7 +352,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       }
     }
 
-    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+    SDLoc DL(N);
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
 
     if (NumVectorElts == 1) {
       return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
@@ -334,18 +367,19 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     // 1 = Vector Register Class
     SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
 
-    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
     bool IsRegSeq = true;
     unsigned NOps = N->getNumOperands();
     for (unsigned i = 0; i < NOps; i++) {
       // XXX: Why is this here?
-      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
+      if (isa<RegisterSDNode>(N->getOperand(i))) {
         IsRegSeq = false;
         break;
       }
       RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
       RegSeqArgs[1 + (2 * i) + 1] =
-              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
+                                        MVT::i32);
     }
 
     if (NOps != NumVectorElts) {
@@ -353,11 +387,11 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
       MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                     SDLoc(N), EltVT);
+                                                     DL, EltVT);
       for (unsigned i = NOps; i < NumVectorElts; ++i) {
         RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
         RegSeqArgs[1 + (2 * i) + 1] =
-          CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+          CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
       }
     }
 
@@ -368,30 +402,30 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   case ISD::BUILD_PAIR: {
     SDValue RC, SubReg0, SubReg1;
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
       break;
     }
+    SDLoc DL(N);
     if (N->getValueType(0) == MVT::i128) {
-      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
-      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
-      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
     } else if (N->getValueType(0) == MVT::i64) {
-      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
-      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
-      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
     } else {
       llvm_unreachable("Unhandled value type for BUILD_PAIR");
     }
     const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                             N->getOperand(1), SubReg1 };
     return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
-                                  SDLoc(N), N->getValueType(0), Ops);
+                                  DL, N->getValueType(0), Ops);
   }
 
   case ISD::Constant:
   case ISD::ConstantFP: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
         N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
       break;
 
@@ -403,38 +437,46 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       Imm = C->getZExtValue();
     }
 
-    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
-                                CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32));
-    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
-                                CurDAG->getConstant(Imm >> 32, MVT::i32));
+    SDLoc DL(N);
+    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
+                                                    MVT::i32));
+    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
     const SDValue Ops[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
-      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
-      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
    };
 
-    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N),
+    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                   N->getValueType(0), Ops);
   }
   case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    SDLoc SL(N);
+    EVT VT = N->getValueType(0);
+
+    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
+      N = glueCopyToM0(N);
+      break;
+    }
+
     // To simplify the TableGen patters, we replace all i64 loads with
     // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
     // during DAG legalization, however, so places (ExpandUnalignedLoad)
     // in the DAG legalizer assume that if i64 is legal, so doing this
     // promotion early can cause problems.
-    EVT VT = N->getValueType(0);
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
-      break;
 
     SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
-                                     LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                      LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
                                       MVT::i64, NewLoad);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
-    SelectCode(NewLoad.getNode());
+    SDNode *Load = glueCopyToM0(NewLoad.getNode());
+    SelectCode(Load);
     N = BitCast.getNode();
     break;
   }
 
@@ -443,63 +485,68 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     // Handle i64 stores here for the same reason mentioned above for loads.
     StoreSDNode *ST = cast<StoreSDNode>(N);
     SDValue Value = ST->getValue();
-    if (Value.getValueType() != MVT::i64 ||
-        ST->isTruncatingStore())
-      break;
+    if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
 
-    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
-                                      MVT::v2i32, Value);
-    SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
-                                        ST->getBasePtr(), ST->getMemOperand());
+      SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                         MVT::v2i32, Value);
+      SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+                                          ST->getBasePtr(), ST->getMemOperand());
 
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+
+      if (NewValue.getOpcode() == ISD::BITCAST) {
+        Select(NewStore.getNode());
+        return SelectCode(NewValue.getNode());
+      }
 
-    if (NewValue.getOpcode() == ISD::BITCAST) {
-      Select(NewStore.getNode());
-      return SelectCode(NewValue.getNode());
+      // getNode() may fold the bitcast if its input was another bitcast. If that
+      // happens we should only select the new store.
+      N = NewStore.getNode();
     }
 
-    // getNode() may fold the bitcast if its input was another bitcast. If that
-    // happens we should only select the new store.
-    N = NewStore.getNode();
+    N = glueCopyToM0(N);
    break;
  }
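The load/store rewriting above deserves a standalone illustration: i64 memory operations are re-expressed as v2i32 plus a bitcast so the TableGen patterns only have to match one 64-bit memory form. A rough sketch of the same trick at the value level (plain C++, independent of SelectionDAG; AMDGPU is little-endian, which is what makes the reinterpretation direct):

#include <cstdint>
#include <cstring>

// An i64 load can be serviced as a two-element i32 load plus a
// reinterpretation, which is all the v2i32 + BITCAST dance amounts to.
uint64_t loadI64AsV2I32(const uint32_t *Mem) {
  uint32_t Pair[2] = {Mem[0], Mem[1]}; // the "v2i32 load"
  uint64_t Result;
  std::memcpy(&Result, Pair, sizeof(Result)); // the "bitcast"
  return Result;
}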
   case AMDGPUISD::REGISTER_LOAD: {
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
 
+    SDLoc DL(N);
     SelectADDRIndirect(N->getOperand(1), Addr, Offset);
     const SDValue Ops[] = {
       Addr,
       Offset,
-      CurDAG->getTargetConstant(0, MVT::i32),
+      CurDAG->getTargetConstant(0, DL, MVT::i32),
       N->getOperand(0),
     };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
-                                  CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
+                                  CurDAG->getVTList(MVT::i32, MVT::i64,
+                                                    MVT::Other),
                                   Ops);
   }
   case AMDGPUISD::REGISTER_STORE: {
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
     SelectADDRIndirect(N->getOperand(2), Addr, Offset);
+    SDLoc DL(N);
     const SDValue Ops[] = {
       N->getOperand(1),
       Addr,
       Offset,
-      CurDAG->getTargetConstant(0, MVT::i32),
+      CurDAG->getTargetConstant(0, DL, MVT::i32),
       N->getOperand(0),
     };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
                                   CurDAG->getVTList(MVT::Other),
                                   Ops);
   }
 
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
-    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
 
     // There is a scalar version available, but unlike the vector version which
@@ -520,21 +567,11 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
     bool Signed = Opc == AMDGPUISD::BFE_I32;
 
-    // Transformation function, pack the offset and width of a BFE into
-    // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
-    // source, bits [5:0] contain the offset and bits [22:16] the width.
-
     uint32_t OffsetVal = Offset->getZExtValue();
     uint32_t WidthVal = Width->getZExtValue();
 
-    uint32_t PackedVal = OffsetVal | WidthVal << 16;
-
-    SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
-    return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
-                                  SDLoc(N),
-                                  MVT::i32,
-                                  N->getOperand(0),
-                                  PackedOffsetWidth);
+    return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
+                    N->getOperand(0), OffsetVal, WidthVal);
   }
 
   case AMDGPUISD::DIV_SCALE: {
@@ -548,6 +585,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   case ISD::ADDRSPACECAST:
     return SelectAddrSpaceCast(N);
+  case ISD::AND:
+  case ISD::SRL:
+  case ISD::SRA:
+    if (N->getValueType(0) != MVT::i32 ||
+        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    return SelectS_BFE(N);
   }
 
   return SelectCode(N);
@@ -604,13 +649,11 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
 }
 
 bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
-  if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-        N->getMemoryVT().bitsLT(MVT::i32)) {
+  if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getMemoryVT().bitsLT(MVT::i32))
       return true;
-    }
-  }
+
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
 }
 
@@ -681,7 +724,8 @@ const char *AMDGPUDAGToDAGISel::getPassName() const {
 bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
                                                          SDValue& IntPtr) {
   if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
-    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
+    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
+                                       true);
     return true;
   }
   return false;
@@ -691,7 +735,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
     SDValue& BaseReg, SDValue &Offset) {
   if (!isa<ConstantSDNode>(Addr)) {
     BaseReg = Addr;
-    Offset = CurDAG->getIntPtrConstant(0, true);
+    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
     return true;
   }
   return false;
@@ -706,7 +750,8 @@ bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
       && isInt<16>(IMMOffset->getZExtValue())) {
 
     Base = Addr.getOperand(0);
-    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+                                       MVT::i32);
     return true;
   // If the pointer address is constant, we can move it to the offset field.
   } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
              && isInt<16>(IMMOffset->getZExtValue())) {
     Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                   SDLoc(CurDAG->getEntryNode()),
                                   AMDGPU::ZERO, MVT::i32);
-    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+                                       MVT::i32);
     return true;
   }
 
   // Default case, no offset
   Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
   return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                             SDValue &Offset) {
   ConstantSDNode *C;
+  SDLoc DL(Addr);
 
   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
     Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
-    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
     Base = Addr.getOperand(0);
-    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else {
     Base = Addr;
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
   }
 
   return true;
@@ -750,8 +797,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
 
   bool IsAdd = (N->getOpcode() == ISD::ADD);
 
-  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
-  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
 
   SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                        DL, MVT::i32, LHS, Sub0);
@@ -777,7 +824,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
                                        SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
 
   SDValue Args[5] = {
-    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
     SDValue(AddLo,0),
     Sub0,
     SDValue(AddHi,0),
@@ -808,12 +855,11 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
 
 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
                                          unsigned OffsetBits) const {
-  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
     return false;
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
     return true;
 
   // On Southern Islands instruction with a negative base value and an offset
@@ -835,15 +881,17 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
     }
   }
 
+  SDLoc DL(Addr);
+
   // If we have a constant address, prefer to put the constant into the
   // offset. This can save moves to load the constant address since multiple
   // operations can share the zero base address register, and enables merging
   // into read2 / write2 instructions.
if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { if (isUInt<16>(CAddr->getZExtValue())) { - SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SDLoc(Addr), MVT::i32, Zero); + DL, MVT::i32, Zero); Base = SDValue(MovZero, 0); Offset = Addr; return true; @@ -852,13 +900,15 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // default case Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i16); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); return true; } bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const { + SDLoc DL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -868,8 +918,8 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, // (add n0, c0) if (isDSOffsetLegal(N0, DWordOffset1, 8)) { Base = N0; - Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); return true; } } @@ -880,21 +930,21 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, assert(4 * DWordOffset0 == CAddr->getZExtValue()); if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { - SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SDLoc(Addr), MVT::i32, Zero); + DL, MVT::i32, Zero); Base = SDValue(MovZero, 0); - Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); return true; } } // default case Base = Addr; - Offset0 = CurDAG->getTargetConstant(0, MVT::i8); - Offset1 = CurDAG->getTargetConstant(1, MVT::i8); + Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); return true; } @@ -910,62 +960,70 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &TFE) const { SDLoc DL(Addr); - GLC = CurDAG->getTargetConstant(0, MVT::i1); - SLC = CurDAG->getTargetConstant(0, MVT::i1); - TFE = CurDAG->getTargetConstant(0, MVT::i1); + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - Idxen = CurDAG->getTargetConstant(0, MVT::i1); - Offen = CurDAG->getTargetConstant(0, MVT::i1); - Addr64 = CurDAG->getTargetConstant(0, MVT::i1); - SOffset = CurDAG->getTargetConstant(0, MVT::i32); + Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (isLegalMUBUFImmOffset(C1)) { - - if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) -> addr64 - SDValue N2 = N0.getOperand(0); - SDValue N3 = N0.getOperand(1); - Addr64 = 
CurDAG->getTargetConstant(1, MVT::i1); - Ptr = N2; - VAddr = N3; - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); - return; - } + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N2; + VAddr = N3; + } else { // (add N0, C1) -> offset - VAddr = CurDAG->getTargetConstant(0, MVT::i32); + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); Ptr = N0; - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + } + + if (isLegalMUBUFImmOffset(C1)) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return; + } else if (isUInt<32>(C1->getZExtValue())) { + // Illegal offset, store it in soffset. + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), + 0); return; } } + if (Addr.getOpcode() == ISD::ADD) { // (add N0, N1) -> addr64 SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, MVT::i1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); Ptr = N0; VAddr = N1; - Offset = CurDAG->getTargetConstant(0, MVT::i16); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); return; } // default case -> offset - VAddr = CurDAG->getTargetConstant(0, MVT::i32); + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); Ptr = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i16); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, - SDValue &Offset) const { - SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE; + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + SDValue Ptr, Offen, Idxen, Addr64; SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -985,11 +1043,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &Offset, - SDValue &SLC) const { - SLC = CurDAG->getTargetConstant(0, MVT::i1); + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); + SDValue GLC, TFE; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, @@ -999,7 +1059,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); @@ -1017,11 +1077,11 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), ScratchRsrcDword0, - 
CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32), + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), }; SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, RsrcOps), 0); @@ -1036,14 +1096,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, if (isLegalMUBUFImmOffset(C1)) { VAddr = Addr.getOperand(0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } } // (node) VAddr = Addr; - ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); return true; } @@ -1053,7 +1113,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &TFE) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget.getInstrInfo()); + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -1087,7 +1147,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); SDLoc DL(N); - assert(Subtarget.hasFlatAddressSpace() && + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && @@ -1116,7 +1176,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { DL, DestVT, Src, - CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32)); + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); } @@ -1125,25 +1185,115 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { // FIXME: This is probably wrong, we should never be defining // a register class with both VGPRs and SGPRs - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, MVT::i32); + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, + MVT::i32); const SDValue Ops[] = { RC, Src, - CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, - CurDAG->getConstant(0, MVT::i32)), 0), - CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32) + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(0, DL, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - SDLoc(N), N->getValueType(0), Ops); + DL, N->getValueType(0), Ops); } assert(SrcSize == 64 && DestSize == 64); return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); } +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width) { + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. 
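The comment above gives the operand layout; the line that follows packs it. As a standalone illustration of the same packing, with hypothetical example values (a sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Pack a bit-field-extract descriptor the way S_BFE_I32 / S_BFE_U32
    // read it: bits [5:0] take the offset, bits [22:16] take the width.
    static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
      return Offset | (Width << 16);
    }

    int main() {
      // Extracting 5 bits starting at bit 8 packs to 0x00050008.
      assert(packBFE(8, 5) == 0x00050008u);
      return 0;
    }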
+ uint32_t PackedVal = Offset | (Width << 16); + SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { + // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)" + // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)" + // Predicate: 0 < b <= c < 32 + + const SDValue &Shl = N->getOperand(0); + ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (B && C) { + uint32_t BVal = B->getZExtValue(); + uint32_t CVal = C->getZExtValue(); + + if (0 < BVal && BVal <= CVal && CVal < 32) { + bool Signed = N->getOpcode() == ISD::SRA; + unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), + CVal - BVal, 32 - CVal); + } + } + return SelectCode(N); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { + switch (N->getOpcode()) { + case ISD::AND: + if (N->getOperand(0).getOpcode() == ISD::SRL) { + // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" + // Predicate: isMask(mask) + const SDValue &Srl = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue(); + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), + ShiftVal, WidthVal); + } + } + } + break; + case ISD::SRL: + if (N->getOperand(0).getOpcode() == ISD::AND) { + // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" + // Predicate: isMask(mask >> b) + const SDValue &And = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), + ShiftVal, WidthVal); + } + } + } else if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + case ISD::SRA: + if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + } + + return SelectCode(N); +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { @@ -1161,7 +1311,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - SrcMods = CurDAG->getTargetConstant(Mods, MVT::i32); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } @@ -1169,9 +1319,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { + SDLoc DL(In); // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, MVT::i32); - Omod = CurDAG->getTargetConstant(0, MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); return SelectVOP3Mods(In, Src, SrcMods); } @@ -1180,7 +1331,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Omod)
const { // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, MVT::i32); + Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); return SelectVOP3Mods(In, Src, SrcMods); } @@ -1189,7 +1340,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { - Clamp = Omod = CurDAG->getTargetConstant(0, MVT::i32); + Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); return SelectVOP3Mods(In, Src, SrcMods); } diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index b137053..d56838e 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -102,11 +102,9 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : - TargetLowering(TM) { - - Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); - +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { setOperationAction(ISD::Constant, MVT::i32, Legal); setOperationAction(ISD::Constant, MVT::i64, Legal); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); @@ -127,12 +125,23 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FABS, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); setOperationAction(ISD::FREM, MVT::f32, Custom); setOperationAction(ISD::FREM, MVT::f64, Custom); + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
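The Promote lines that follow are what the comment above describes: f32 memory operations are rewritten over i32 so that only integer load/store patterns need to exist in the .td files. Roughly, in scalar terms (illustrative C++ only; the function name is invented):

    #include <cstdint>
    #include <cstring>

    // Store an f32 through its bit pattern: bitcast to a 32-bit integer,
    // then perform a plain integer store.
    static void storeF32AsI32(uint32_t *Mem, float V) {
      uint32_t Bits;
      std::memcpy(&Bits, &V, sizeof(Bits)); // the bitcast step
      *Mem = Bits;                          // the integer store
    }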
setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -248,9 +257,22 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { @@ -292,6 +314,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SMIN, MVT::i32, Legal); + setOperationAction(ISD::UMIN, MVT::i32, Legal); + setOperationAction(ISD::SMAX, MVT::i32, Legal); + setOperationAction(ISD::UMAX, MVT::i32, Legal); + if (!Subtarget->hasFFBH()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); @@ -384,6 +411,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -497,6 +527,12 @@ bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { return VT == MVT::f32 || VT == MVT::f64; } +bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const { + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. 
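Concretely, the return on the next line treats a truncate as free exactly when the result is strictly narrower and consists of whole 32-bit subregisters. The same predicate over plain bit widths (an illustrative sketch):

    // Mirrors Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0).
    static bool isTruncateFreeBits(unsigned SrcBits, unsigned DstBits) {
      return DstBits < SrcBits && DstBits % 32 == 0;
    }
    // isTruncateFreeBits(64, 32) -> true:  i64 -> i32 reads the low dword.
    // isTruncateFreeBits(64, 16) -> false: i16 is not a whole subregister.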
return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); @@ -601,6 +637,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); @@ -660,14 +697,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const SDValue &InitPtr, SDValue Chain, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); + const DataLayout *TD = getDataLayout(); SDLoc DL(InitPtr); Type *InitTy = Init->getType(); if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { EVT VT = EVT::getEVT(InitTy); PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(InitTy)); } @@ -675,7 +712,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { EVT VT = EVT::getEVT(CFP->getType()); PointerType *PtrTy = PointerType::get(CFP->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, + return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CFP->getType())); } @@ -687,7 +724,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, SmallVector<SDValue, 8> Chains; for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { - SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); Constant *Elt = Init->getAggregateElement(I); @@ -711,7 +748,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); SmallVector<SDValue, 8> Chains; for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); + SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); Constant *Elt = Init->getAggregateElement(i); @@ -748,7 +785,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, SDValue Op, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); + const DataLayout *TD = getDataLayout(); GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); @@ -773,7 +810,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, Offset = MFI->LocalMemoryObjects[GV]; } - return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); + return DAG.getConstant(Offset, SDLoc(Op), + getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); } case AMDGPUAS::CONSTANT_ADDRESS: { MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); @@ -849,14 +887,13 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); 
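LowerFrameIndex, continued below, folds the frame slot into a constant whose value is the frame-lowering offset scaled by 4 bytes times the stack width. The arithmetic in isolation (a sketch with assumed parameter names):

    // Byte offset of a frame slot, matching Offset * 4 * StackWidth in the
    // return statement below.
    static unsigned frameSlotByteOffset(unsigned Offset, unsigned StackWidth) {
      return Offset * 4 * StackWidth;
    }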
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - getTargetMachine().getSubtargetImpl()->getFrameLowering()); + const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); - return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } @@ -931,9 +968,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, VT)); + DAG.getConstantFP(Max, DL, VT)); return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, - DAG.getConstantFP(Min, VT)); + DAG.getConstantFP(Min, DL, VT)); } else { return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); } @@ -943,17 +980,17 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_imax: - return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_umax: - return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_imin: - return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_umin: - return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_umul24: return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, @@ -1028,10 +1065,10 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), - Op.getOperand(1)); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Op.getOperand(1)); - return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); + return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); } /// Linear Interpolation @@ -1041,7 +1078,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, - DAG.getConstantFP(1.0f, MVT::f32), + DAG.getConstantFP(1.0f, DL, MVT::f32), Op.getOperand(1)); SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, Op.getOperand(3)); @@ -1130,7 +1167,7 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -/// \brief Generate Min/Max node +// FIXME: Remove this when combines added to DAGCombiner. SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, EVT VT, SDValue LHS, @@ -1146,22 +1183,22 @@ SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, switch (CCOpcode) { case ISD::SETULE: case ISD::SETULT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::UMIN : AMDGPUISD::UMAX; + unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETLE: case ISD::SETLT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::SMIN : AMDGPUISD::SMAX; + unsigned Opc = (LHS == True) ? 
ISD::SMIN : ISD::SMAX; return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: { - unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN; + unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETUGE: case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN; + unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; return DAG.getNode(Opc, DL, VT, LHS, RHS); } default: @@ -1189,7 +1226,7 @@ SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, for (unsigned i = 0; i < NumElts; ++i) { SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * MemEltSize, PtrVT)); + DAG.getConstant(i * MemEltSize, SL, PtrVT)); SDValue NewLoad = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, @@ -1240,7 +1277,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, Load->isInvariant(), Load->getAlignment()); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, @@ -1280,18 +1318,18 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, unsigned MemEltBits = MemEltVT.getSizeInBits(); unsigned MemNumElements = MemVT.getVectorNumElements(); unsigned PackedSize = MemVT.getStoreSizeInBits(); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); assert(Value.getValueType().getScalarSizeInBits() >= 32); SDValue PackedValue; for (unsigned i = 0; i < MemNumElements; ++i) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, - DAG.getConstant(i, MVT::i32)); + DAG.getConstant(i, DL, MVT::i32)); Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg - SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); + SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); if (i == 0) { @@ -1333,9 +1371,9 @@ SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Store->getValue(), - DAG.getConstant(i, MVT::i32)); + DAG.getConstant(i, SL, MVT::i32)); - SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT); + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); SDValue NewStore = DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, @@ -1374,7 +1412,8 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, EVT PtrVT = BasePtr.getValueType(); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); SDValue LoStore @@ -1430,22 +1469,34 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) return SDValue(); + // Targets before SI: a private-address extending load of less than + // 32 bits is lowered to a register load plus a byte (2-byte) extract. + // Get the register holding the target.
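The node sequence built below is a word-aligned read followed by a shift-and-mask byte extract. The same arithmetic in plain C++ (a sketch with invented names; the real path also handles 2-byte widths and the sign-extending case):

    #include <cstdint>

    // Read the byte at address Addr when private memory is backed by an
    // array of 32-bit registers, mirroring the SRL/AND/SHL/SRL nodes below.
    static uint32_t loadPrivateByte(const uint32_t *Regs, uint32_t Addr) {
      uint32_t Word = Regs[Addr >> 2];  // REGISTER_LOAD of the containing dword
      uint32_t ByteIdx = Addr & 0x3;    // offset within the register
      uint32_t ShiftAmt = ByteIdx << 3; // bit offset of the byte (byteIdx * 8)
      return (Word >> ShiftAmt) & 0xff; // zero-extend case of the tail below
    }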
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, MVT::i32)); + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register. SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), Load->getChain(), Ptr, - DAG.getTargetConstant(0, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), Op.getOperand(2)); + + // Get offset within the register. SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(0x3, MVT::i32)); + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, MVT::i32)); + DAG.getConstant(3, DL, MVT::i32)); + // Shift to the right. Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + // Eliminate the upper bits by setting them to ... EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. if (ExtType == ISD::SEXTLOAD) { SDValue MemEltVTNode = DAG.getValueType(MemEltVT); @@ -1457,6 +1508,7 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } + // ... or zeros. SDValue Ops[] = { DAG.getZeroExtendInReg(Ret, DL, MemEltVT), Load->getChain() @@ -1491,15 +1543,16 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } SDValue BasePtr = Store->getBasePtr(); SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, MVT::i32)); + DAG.getConstant(2, DL, MVT::i32)); SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, DAG.getTargetConstant(0, MVT::i32)); + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, - DAG.getConstant(0x3, MVT::i32)); + DAG.getConstant(0x3, DL, MVT::i32)); SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, MVT::i32)); + DAG.getConstant(3, DL, MVT::i32)); SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Store->getValue()); @@ -1509,15 +1562,17 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, MaskedValue, ShiftAmt); - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32), + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), ShiftAmt); DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, MVT::i32)); + DAG.getConstant(0xffffffff, DL, MVT::i32)); Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); } return SDValue(); } @@ -1544,17 +1599,18 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool unsigned BitSize = VT.getScalarType().getSizeInBits(); - SDValue jq = DAG.getConstant(1, IntVT); + SDValue jq = DAG.getConstant(1, DL, IntVT); if (sign) { // char|short jq = ia ^ ib; jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT)); + jq = DAG.getNode(ISD::SRA, DL, VT, jq, + DAG.getConstant(BitSize - 2, DL, VT)); // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT)); + jq = DAG.getNode(ISD::OR, DL, VT, jq, 
DAG.getConstant(1, DL, VT)); // jq = (int)jq jq = DAG.getSExtOrTrunc(jq, DL, IntVT); @@ -1603,7 +1659,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT)); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); // dst = trunc/extend to legal type iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); @@ -1631,8 +1687,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, EVT VT = Op.getValueType(); EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - SDValue one = DAG.getConstant(1, HalfVT); - SDValue zero = DAG.getConstant(0, HalfVT); + SDValue one = DAG.getConstant(1, DL, HalfVT); + SDValue zero = DAG.getConstant(0, DL, HalfVT); //HiLo split SDValue LHS = Op.getOperand(0); @@ -1643,12 +1699,26 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + // Get Speculative values SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Hi = zero; SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); SDValue DIV_Lo = zero; @@ -1656,42 +1726,28 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, const unsigned halfBitWidth = HalfVT.getSizeInBits(); for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit - SDValue HBit; - if (halfBitWidth == 32 && Subtarget->hasBFE()) { - HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); - } else { - HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - } - - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + const unsigned bitPos = halfBitWidth - i - 1; + SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); + // Get value of high bit + SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); + + // Shift + REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); + // Add LHS high bit + REM = DAG.getNode(ISD::OR, DL, VT, REM, 
HBit); + + SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); // Update REM - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); } - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); Results.push_back(DIV); Results.push_back(REM); @@ -1712,8 +1768,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Den = Op.getOperand(1); if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) && - DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) { + if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && + DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { // TODO: We technically could do this for i64, but shouldn't that just be // handled by something generally reducing 64-bit division on 32-bit // values to 32-bit? @@ -1732,11 +1788,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); // NEG_RCP_LO = -RCP_LO - SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RCP_LO); // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) - SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), NEG_RCP_LO, RCP_LO, ISD::SETEQ); // Calculate the rounding error from the URECIP instruction @@ -1750,7 +1806,7 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) - SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), RCP_A_E, RCP_S_E, ISD::SETEQ); // Quotient = mulhu(Tmp0, Num) @@ -1764,14 +1820,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, - DAG.getConstant(-1, VT), - DAG.getConstant(0, VT), + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), ISD::SETUGE); // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, Num_S_Remainder, - DAG.getConstant(-1, VT), - DAG.getConstant(0, VT), + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), ISD::SETUGE); // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, @@ -1781,18 +1837,18 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Quotient_A_One = Quotient + 1 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, - DAG.getConstant(1, VT)); + DAG.getConstant(1, DL, VT)); // Quotient_S_One = Quotient - 1 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, - DAG.getConstant(1, VT)); + DAG.getConstant(1, DL, VT)); // Div = (Tmp1 == 0 ? 
Quotient : Quotient_A_One) - SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), Quotient, Quotient_A_One, ISD::SETEQ); // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) - Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), Quotient_S_One, Div, ISD::SETEQ); // Calculate Rem result: @@ -1804,11 +1860,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) - SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), Remainder, Remainder_S_Den, ISD::SETEQ); // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) - Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), Remainder_A_Den, Rem, ISD::SETEQ); SDValue Ops[2] = { Div, @@ -1825,19 +1881,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - if (VT == MVT::i32) { - if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 && - DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? - return LowerDIVREM24(Op, DAG, true); - } + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue NegOne = DAG.getConstant(-1, DL, VT); + + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); } - SDValue Zero = DAG.getConstant(0, VT); - SDValue NegOne = DAG.getConstant(-1, VT); - SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); @@ -1889,8 +1957,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); - const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); @@ -1902,14 +1970,28 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = 
DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, SL, MVT::i32), + DAG.getConstant(ExpBits, SL, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, SL, MVT::i32)); + + return Exp; +} + SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); assert(Op.getValueType() == MVT::f64); - const SDValue Zero = DAG.getConstant(0, MVT::i32); - const SDValue One = DAG.getConstant(1, MVT::i32); + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); @@ -1917,19 +1999,12 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { // exponent. SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); - const unsigned FractBits = 52; - const unsigned ExpBits = 11; + SDValue Exp = extractF64Exponent(Hi, SL, DAG); - // Extract the exponent. - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - Hi, - DAG.getConstant(FractBits - 32, MVT::i32), - DAG.getConstant(ExpBits, MVT::i32)); - SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, - DAG.getConstant(1023, MVT::i32)); + const unsigned FractBits = 52; // Extract the sign bit. - const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); + const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); // Extend back to 64-bits. @@ -1939,7 +2014,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); const SDValue FractMask - = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64); + = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); @@ -1947,7 +2022,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); @@ -1965,7 +2040,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f64); APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); - SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64); + SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); @@ -1974,7 +2049,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); - SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64); + SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); @@ -1989,6 +2064,101 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(),
Op.getOperand(0)); } +// XXX - May require not supporting f32 denormals? +SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, + MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), SL, + MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, SL, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, SL, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, SL, MVT::f64), + DAG.getConstantFP(0.0, SL, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1999,8 +2169,8 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); - const 
SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64); + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); @@ -2020,9 +2190,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, SL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, SL, MVT::i32)); SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, SL, MVT::f64, Hi); @@ -2030,7 +2200,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, - DAG.getConstant(32, MVT::i32)); + DAG.getConstant(32, SL, MVT::i32)); return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); } @@ -2051,13 +2221,13 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, // f32 uint_to_fp i64 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, DL, MVT::i32)); SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 + DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); } @@ -2078,10 +2248,10 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - SDValue K0 - = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64); - SDValue K1 - = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64); + SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, + MVT::f64); + SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, + MVT::f64); SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); @@ -2180,14 +2350,14 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { template <typename IntTy> static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, - uint32_t Offset, uint32_t Width) { + uint32_t Offset, uint32_t Width, SDLoc DL) { if (Width + Offset < 32) { uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); - return DAG.getConstant(Result, MVT::i32); + return DAG.getConstant(Result, DL, MVT::i32); } - return DAG.getConstant(Src0 >> Offset, MVT::i32); + return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); } static bool usesAllNormalStores(SDNode *LoadVal) { @@ -2292,7 +2462,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT: { SDValue Cond = N->getOperand(0); if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - SDLoc DL(N); EVT VT = N->getValueType(0); SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); @@ -2323,7 +2492,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, uint32_t WidthVal = 
Width->getZExtValue() & 0x1f; if (WidthVal == 0) - return DAG.getConstant(0, MVT::i32); + return DAG.getConstant(0, DL, MVT::i32); ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!Offset) @@ -2362,17 +2531,19 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return constantFoldBFE<int32_t>(DAG, CVal->getSExtValue(), OffsetVal, - WidthVal); + WidthVal, + DL); } return constantFoldBFE<uint32_t>(DAG, CVal->getZExtValue(), OffsetVal, - WidthVal); + WidthVal, + DL); } if ((OffsetVal + WidthVal) >= 32) { - SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); + SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); } @@ -2476,8 +2647,8 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: return nullptr; + switch ((AMDGPUISD::NodeType)Opcode) { + case AMDGPUISD::FIRST_NUMBER: break; // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); @@ -2488,13 +2659,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(MAD) + NODE_NAME_CASE(COS_HW) + NODE_NAME_CASE(SIN_HW) NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(SMAX) - NODE_NAME_CASE(UMAX) NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(SMIN) - NODE_NAME_CASE(UMIN) NODE_NAME_CASE(FMAX3) NODE_NAME_CASE(SMAX3) NODE_NAME_CASE(UMAX3) @@ -2513,6 +2681,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(CARRY) + NODE_NAME_CASE(BORROW) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) @@ -2522,6 +2692,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) + NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) @@ -2538,9 +2709,16 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) + case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(SENDMSG) + NODE_NAME_CASE(INTERP_MOV) + NODE_NAME_CASE(INTERP_P1) + NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } + return nullptr; } SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, @@ -2630,13 +2808,11 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( break; } - case AMDGPUISD::SMAX: - case AMDGPUISD::UMAX: - case AMDGPUISD::SMIN: - case AMDGPUISD::UMIN: - computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), - KnownZero, KnownOne, DAG, Depth); + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: { + KnownZero = APInt::getHighBitsSet(32, 31); break; + } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { @@ -2680,6 +2856,10 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return Width ? 
32 - (Width->getZExtValue() & 0x1f) : 1; } + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: + return 31; + default: return 1; } diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h index 15f529c..fbb7d3c 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -43,12 +43,15 @@ private: /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. - SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -86,6 +89,7 @@ protected: SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const; @@ -106,7 +110,7 @@ protected: const SmallVectorImpl<ISD::InputArg> &Ins) const; public: - AMDGPUTargetLowering(TargetMachine &TM); + AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; @@ -129,6 +133,10 @@ public: EVT ExtVT) const override; bool isLoadBitCastBeneficial(EVT, EVT) const override; + + bool storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ -203,7 +211,7 @@ public: namespace AMDGPUISD { -enum { +enum NodeType : unsigned { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, CALL, // Function call based on a single integer @@ -214,18 +222,13 @@ enum { DWORDADDR, FRACT, CLAMP, - MAD, // Multiply + add with same result as the separate operations. // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. COS_HW, SIN_HW, FMAX_LEGACY, - SMAX, - UMAX, FMIN_LEGACY, - SMIN, - UMIN, FMAX3, SMAX3, UMAX3, @@ -247,6 +250,8 @@ enum { LDEXP, FP_CLASS, DOT4, + CARRY, + BORROW, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) @@ -283,6 +288,10 @@ enum { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. 
CONST_DATA_PTR, + SENDMSG, + INTERP_MOV, + INTERP_P1, + INTERP_P2, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp index e34a7b7..64e295f 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { } + : AMDGPUGenInstrInfo(-1, -1), ST(st) {} const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { return RI; @@ -152,26 +152,20 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const return true; } - -MachineInstr * -AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { // TODO: Implement this function return nullptr; } -MachineInstr* -AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr *LoadMI) const { +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { // TODO: Implement this function return nullptr; } -bool -AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { +bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef<unsigned> Ops) const { // TODO: Implement this function return false; } @@ -319,10 +313,7 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getFrameIndexOffset(MF, -1); + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } @@ -353,7 +344,7 @@ enum SISubtarget { VI = 1 }; -enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { +static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { switch (Gen) { default: return SI; @@ -363,8 +354,8 @@ enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, - AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration())); + int MCOp = AMDGPU::getMCOpcode( + Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); // -1 means that Opcode is already a native instruction. 
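Restating the contract of getMCOpcode as used above: it returns the generation-specific MC opcode for a pseudo instruction, with -1 as the sentinel for an opcode that is already native. A sketch of the resulting mapping (a hedged reading of the comment; illustrative only):

    // -1 from the table means the pseudo opcode needs no translation and is
    // returned unchanged; anything else is the resolved MC opcode.
    static int resolveMCOpcode(int Opcode, int TableResult) {
      return TableResult == -1 ? Opcode : TableResult;
    }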
if (MCOp == -1) diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h index 202183c..8fd27a1 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h @@ -85,14 +85,15 @@ public: const TargetRegisterInfo *TRI) const override; protected: - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const override; + public: /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. @@ -103,7 +104,7 @@ public: int getIndirectIndexEnd(const MachineFunction &MF) const; bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const override; + ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td index d657ad0..b413897 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td @@ -78,7 +78,6 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, >; def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; -def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>; // out = max(a, b) a and b are signed ints def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, @@ -95,16 +94,6 @@ def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, [] >; -// out = min(a, b) a and b are signed ints -def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = min(a, b) a and b are unsigned ints -def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - // FIXME: TableGen doesn't like commutative instructions with more // than 2 operands. // out = max(a, b, c) a, b and c are floats @@ -137,6 +126,13 @@ def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, [/*SDNPCommutative, SDNPAssociative*/] >; +// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 +def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; + +// out = (src1 > src0) ? 
1 : 0 +def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; + + def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", @@ -213,6 +209,22 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", + SDTypeProfile<1, 4, [SDTCisFP<0>]>, + [SDNPInGlue]>; + //===----------------------------------------------------------------------===// // Flow Control Profile Types //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td index 34b1fc8..72cab39 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td @@ -23,8 +23,6 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio let Pattern = pattern; let Itinerary = NullALU; - let isCodeGenOnly = 1; - let TSFlags{63} = isRegisterLoad; let TSFlags{62} = isRegisterStore; } @@ -185,12 +183,15 @@ def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; -def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ +class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ LoadSDNode *L = cast<LoadSDNode>(N); return L->getExtensionType() == ISD::ZEXTLOAD || L->getExtensionType() == ISD::EXTLOAD; }]>; +def az_extload : AZExtLoadBase <unindexedload>; + def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; }]>; @@ -360,25 +361,29 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - - -def atomic_cmp_swap_32_local : - PatFrag<(ops node:$ptr, node:$cmp, node:$swap), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast<AtomicSDNode>(N); - return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + +multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { + + def _32_local : PatFrag < + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; + + def _64_local : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; +} -def atomic_cmp_swap_64_local : - PatFrag<(ops node:$ptr, node:$cmp, node:$swap), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast<AtomicSDNode>(N); - 
return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; +defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isFlatLoad(dyn_cast<LoadSDNode>(N)); @@ -391,7 +396,7 @@ def flat_store : PatFrag<(ops node:$val, node:$ptr), def mskor_flat : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]>; class global_binary_atomic_op<SDNode atomic_op> : PatFrag< @@ -415,11 +420,6 @@ def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; // Misc Pattern Fragments //===----------------------------------------------------------------------===// -def fmad : PatFrag < - (ops node:$src0, node:$src1, node:$src2), - (fadd (fmul node:$src0, node:$src1), node:$src2) ->; - class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; @@ -440,6 +440,11 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; +def FP_HALF : PatLeaf < + (fpimm), + [{return N->isExactlyValue(0.5);}] +>; + let isCodeGenOnly = 1, isPseudo = 1 in { let usesCustomInserter = 1 in { @@ -580,22 +585,20 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < // Bitfield extract patterns -/* - -XXX: The BFE pattern is not working correctly because the XForm is not being -applied. +def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ + return isMask_32(N->getZExtValue()); +}]>; -def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>; -def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}], - SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>; +def IMMPopCount : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; -class BFEPattern <Instruction BFE> : Pat < - (and (srl i32:$x, legalshift32:$y), bfemask:$z), - (BFE $x, $y, $z) +class BFEPattern <Instruction BFE, Instruction MOV> : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) >; -*/ - // rotr pattern class ROTRPattern <Instruction BIT_ALIGN> : Pat < (rotr i32:$src0, i32:$src1), @@ -605,6 +608,20 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat < // 24-bit arithmetic patterns def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; +// Special conversion patterns + +def cvt_rpi_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor (fadd $src, FP_HALF))), + [{ (void) N; return TM.Options.NoNaNsFPMath; }] +>; + +def cvt_flr_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor $src)), + [{ (void)N; return TM.Options.NoNaNsFPMath; }] +>; + /* class UMUL24Pattern <Instruction UMUL24> : Pat < (mul U24:$x, U24:$y), diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp index 03aa32d..2083146 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -58,32 +58,32 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { default: llvm_unreachable("unknown operand type"); case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); + MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_Register: - MCOp = 
MCOperand::CreateReg(MO.getReg()); + MCOp = MCOperand::createReg(MO.getReg()); break; case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( MO.getMBB()->getSymbol(), Ctx)); break; case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(GV->getName())); - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(Sym, Ctx)); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } case MachineOperand::MO_TargetIndex: { assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); - MCOp = MCOperand::CreateExpr(Expr); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); break; } case MachineOperand::MO_ExternalSymbol: { - MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(MO.getSymbolName())); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); - MCOp = MCOperand::CreateExpr(Expr); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); break; } } @@ -92,12 +92,12 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - AMDGPUMCInstLower MCInstLowering(OutContext, - MF->getTarget().getSubtarget<AMDGPUSubtarget>()); + const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI); #ifdef _DEBUG StringRef Err; - if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) { + if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { errs() << "Warning: Illegal instruction detected: " << Err << "\n"; MI->dump(); } @@ -113,28 +113,29 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { } else { MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); - EmitToStreamer(OutStreamer, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); - if (DisasmEnabled) { + if (STI.dumpCode()) { // Disassemble instruction/operands to text. DisasmLines.resize(DisasmLines.size() + 1); std::string &DisasmLine = DisasmLines.back(); raw_string_ostream DisasmStream(DisasmLine); AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *TM.getSubtargetImpl()->getInstrInfo(), - *TM.getSubtargetImpl()->getRegisterInfo()); - InstPrinter.printInst(&TmpInst, DisasmStream, StringRef()); + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), + MF->getSubtarget()); // Disassemble instruction/operands to hex representation. 
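A note on the DisasmLines handling above: raw_string_ostream appends directly into the std::string it wraps, which is why resizing the vector first and then writing into back() works. A self-contained sketch of the idiom (the function name is illustrative):

    #include "llvm/Support/raw_ostream.h"
    #include <string>
    #include <vector>

    // Grow the log by one entry, then format into the new slot in place,
    // mirroring how DisasmLines.back() is filled above.
    static void appendLogLine(std::vector<std::string> &Lines, unsigned Opcode) {
      Lines.resize(Lines.size() + 1);
      llvm::raw_string_ostream OS(Lines.back());
      OS << "opcode=" << Opcode;
      OS.flush(); // push any buffered bytes into the wrapped string
    }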
SmallVector<MCFixup, 4> Fixups; SmallVector<char, 16> CodeBytes; raw_svector_ostream CodeStream(CodeBytes); - MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer; + auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer); MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); - InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups, - TM.getSubtarget<MCSubtargetInfo>()); + InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, + MF->getSubtarget<MCSubtargetInfo>()); CodeStream.flush(); HexLines.resize(HexLines.size() + 1); diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp index 0f3f9e2..21c7da6 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -15,9 +15,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : LDSSize(0), ScratchSize(0), IsKernel(true) { - AttributeSet Set = MF.getFunction()->getAttributes(); - Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, - ShaderTypeAttribute); + Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); if (A.isStringAttribute()) { StringRef Str = A.getValueAsString(); diff --git a/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp index b81fef4..4a65bfc 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -87,7 +88,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { continue; if (Use->getParent()->getParent() == &F) LocalMemAvailable -= - Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType()); + Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); } } } @@ -276,8 +277,8 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { // value from the reqd_work_group_size function attribute if it is // available. 
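For scale, a worked example of the sizing check that follows: at the assumed worst-case work-group size of 256, promoting a per-thread float[16] alloca costs 256 * 16 * 4 = 16384 bytes of LDS, so it only fits if at least that much local memory is still unclaimed. The same arithmetic as a sketch:

    // Worked example of the AllocaSize check below (numbers illustrative).
    static bool fitsInLDS(int LocalMemAvailable /* e.g. 32768 */) {
      unsigned WorkGroupSize = 256;     // worst-case threads per work-group
      unsigned PerThreadBytes = 16 * 4; // a per-thread float[16] alloca
      int AllocaSize = WorkGroupSize * PerThreadBytes; // 16384 bytes of LDS
      return AllocaSize <= LocalMemAvailable;
    }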
unsigned WorkGroupSize = 256; - int AllocaSize = WorkGroupSize * - Mod->getDataLayout()->getTypeAllocSize(AllocaTy); + int AllocaSize = + WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); if (AllocaSize > LocalMemAvailable) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); @@ -294,9 +295,9 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; + Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); GlobalVariable *GV = new GlobalVariable( - *Mod, ArrayType::get(I.getAllocatedType(), 256), false, - GlobalValue::ExternalLinkage, 0, I.getName(), 0, + *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); FunctionType *FTy = FunctionType::get( @@ -315,12 +316,11 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Value *ReadTIDIGZ = Mod->getOrInsertFunction( "llvm.r600.read.tidig.z", FTy, AttrSet); - - Value *TCntY = Builder.CreateCall(ReadLocalSizeY); - Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ); - Value *TIdX = Builder.CreateCall(ReadTIDIGX); - Value *TIdY = Builder.CreateCall(ReadTIDIGY); - Value *TIdZ = Builder.CreateCall(ReadTIDIGZ); + Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); + Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); + Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); Tmp0 = Builder.CreateMul(Tmp0, TIdX); @@ -332,7 +332,7 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); Indices.push_back(TID); - Value *Offset = Builder.CreateGEP(GV, Indices); + Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); @@ -365,8 +365,8 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Function *F = Call->getCalledFunction(); FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, F->isVarArg()); - Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType, - F->getAttributes()); + Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), + NewType, F->getAttributes()); Function *NewF = cast<Function>(C); Call->setCalledFunction(NewF); continue; diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp index 57b054b..3ca0eca 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -17,10 +17,7 @@ using namespace llvm; -AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st) -: AMDGPUGenRegisterInfo(0), - ST(st) - { } +AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} //===----------------------------------------------------------------------===// // Function handling callbacks - Functions are a seldom used feature of GPUs, so diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h index f27576a..cfd800b 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h @@ -30,9 +30,8 @@ class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { static const MCPhysReg CalleeSavedReg; - const AMDGPUSubtarget
&ST; - AMDGPURegisterInfo(const AMDGPUSubtarget &st); + AMDGPURegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp index 87cdb5f..5288866 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp @@ -31,22 +31,9 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -static std::string computeDataLayout(const AMDGPUSubtarget &ST) { - std::string Ret = "e-p:32:32"; - - if (ST.is64bit()) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) { +AMDGPUSubtarget::initializeSubtargetDependencies(StringRef TT, StringRef GPU, + StringRef FS) { // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be // enabled, but some instructions do not respect them and they run at the @@ -59,6 +46,9 @@ AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) { SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); FullFS += FS; + if (GPU == "" && Triple(TT).getArch() == Triple::amdgcn) + GPU = "SI"; + ParseSubtargetFeatures(GPU, FullFS); // FIXME: I don't think Evergreen has any useful support for @@ -76,23 +66,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), CaymanISA(false), - FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), - EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - EnableVGPRSpilling(false),SGPRInitBug(false), - DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))), + FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), + CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), + WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + EnableVGPRSpilling(false), SGPRInitBug(false), + IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), + LDSBankCount(0), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), - InstrItins(getInstrItineraryForCPU(GPU)), - TargetTriple(TT) { + InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { + + initializeSubtargetDependencies(TT, GPU, FS); + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); - TLInfo.reset(new R600TargetLowering(TM)); + TLInfo.reset(new R600TargetLowering(TM, *this)); } else { InstrInfo.reset(new SIInstrInfo(*this)); - TLInfo.reset(new SITargetLowering(TM)); + TLInfo.reset(new SITargetLowering(TM, *this)); } } diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h index eeb41d3..a5a901c 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h @@ -22,7 +22,6 @@ #include "R600ISelLowering.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #define GET_SUBTARGETINFO_HEADER @@ -60,6 +59,7 @@ private: bool FP64; bool FP64Denormals; bool FP32Denormals; + bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; bool EnableIRStructurizer; @@ -71,8 +71,13 @@ private: int LocalMemorySize; bool EnableVGPRSpilling; bool SGPRInitBug; + bool IsGCN; + bool GCN1Encoding; + bool GCN3Encoding; + bool CIInsts; + bool FeatureDisable; + int LDSBankCount; - const DataLayout DL; AMDGPUFrameLowering FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; @@ -81,7 +86,8 @@ private: public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM); - AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS); + AMDGPUSubtarget &initializeSubtargetDependencies(StringRef TT, StringRef GPU, + StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { return &FrameLowering; @@ -95,7 +101,6 @@ public: AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); } - const DataLayout *getDataLayout() const override { return &DL; } const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } @@ -134,6 +139,10 @@ public: return FP64Denormals; } + bool hasFastFMAF32() const { + return FastFMAF32; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } @@ -177,6 +186,14 @@ public: return (getGeneration() >= EVERGREEN); } + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + bool IsIRStructurizerEnabled() const { return EnableIRStructurizer; } @@ -212,10 +229,14 @@ public: return SGPRInitBug; } + int getLDSBankCount() const { + return LDSBankCount; + } + unsigned getAmdKernelCodeChipID() const; bool enableMachineScheduler() const override { - return getGeneration() <= NORTHERN_ISLANDS; + return true; } void overrideSchedPolicy(MachineSchedPolicy &Policy, @@ -249,6 +270,10 @@ public: // FIXME: Not sure what this is for other subtargets.
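The hasCARRY()/hasBORROW() getters added above follow the subtarget's usual generation-gating pattern: generations are declared in chronological order, so a >= comparison reads as "this chip or newer". A compact stand-alone sketch (enum names illustrative):

    enum GenerationSketch {
      R600_GEN, R700_GEN, EVERGREEN_GEN, NORTHERN_ISLANDS_GEN,
      SOUTHERN_ISLANDS_GEN, SEA_ISLANDS_GEN, VOLCANIC_ISLANDS_GEN
    };

    struct SubtargetSketch {
      GenerationSketch Gen;
      // CARRY/BORROW-style ALU ops exist from Evergreen onward.
      bool hasCARRY() const { return Gen >= EVERGREEN_GEN; }
      bool hasBORROW() const { return Gen >= EVERGREEN_GEN; }
    };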
llvm_unreachable("do not know max waves per CU for this subtarget."); } + + bool enableSubRegLiveness() const override { + return true; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp index a1da717..44c2abd 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -15,6 +15,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" +#include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" @@ -27,7 +28,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" @@ -38,7 +39,7 @@ using namespace llvm; extern "C" void LLVMInitializeR600Target() { // Register the target - RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget); + RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); } @@ -50,14 +51,30 @@ static MachineSchedRegistry SchedCustomRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); +static std::string computeDataLayout(StringRef TT) { + Triple Triple(TT); + std::string Ret = "e-p:32:32"; + + if (Triple.getArch() == Triple::amdgcn) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), - TLOF(new TargetLoweringObjectFileELF()), - Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OptLevel), + TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } @@ -66,10 +83,33 @@ AMDGPUTargetMachine::~AMDGPUTargetMachine() { delete TLOF; } +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +R600TargetMachine::R600TargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : + AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } + + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : + AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } + +//===----------------------------------------------------------------------===// +// AMDGPU Pass Setup 
+//===----------------------------------------------------------------------===// + namespace { class AMDGPUPassConfig : public TargetPassConfig { public: - AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) + AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} AMDGPUTargetMachine &getAMDGPUTargetMachine() const { @@ -78,7 +118,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) return createR600MachineScheduler(C); return nullptr; @@ -86,6 +126,25 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; + virtual bool addPreISel() override; + virtual bool addInstSelector() override; +}; + +class R600PassConfig : public AMDGPUPassConfig { +public: + R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + + bool addPreISel() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +class GCNPassConfig : public AMDGPUPassConfig { +public: + GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } bool addPreISel() override; bool addInstSelector() override; void addPreRegAlloc() override; @@ -93,22 +152,12 @@ public: void addPreSched2() override; void addPreEmitPass() override; }; -} // End of anonymous namespace - -TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { - return new AMDGPUPassConfig(this, PM); -} -//===----------------------------------------------------------------------===// -// AMDGPU Analysis Pass Setup -//===----------------------------------------------------------------------===// +} // End of anonymous namespace -void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This - // allows the AMDGPU pass to delegate to the target independent layer when - // appropriate. 
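The hunk below swaps the old ImmutablePass-based TTI registration for getTargetIRAnalysis(), where the analysis is just a callback that builds the target's TTI result per function. A minimal sketch of that callback shape, using stand-in types for llvm::Function and TargetTransformInfo:

    #include <functional>

    struct FunctionSketch {};                        // stand-in for llvm::Function
    struct TTIResultSketch { int UnrollThreshold; }; // stand-in for the TTI result

    // The "analysis" is a factory invoked once per function on demand.
    using TargetIRAnalysisSketch =
        std::function<TTIResultSketch(FunctionSketch &)>;

    static TargetIRAnalysisSketch makeTargetIRAnalysis() {
      return [](FunctionSketch &) { return TTIResultSketch{300}; };
    }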
- PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createAMDGPUTargetTransformInfoPass(this)); +TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); } void AMDGPUPassConfig::addIRPasses() { @@ -125,108 +174,119 @@ } void AMDGPUPassConfig::addCodeGenPrepare() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); if (ST.isPromoteAllocaEnabled()) { addPass(createAMDGPUPromoteAlloca(ST)); addPass(createSROAPass()); } - TargetPassConfig::addCodeGenPrepare(); } bool AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createFlattenCFGPass()); if (ST.IsIRStructurizerEnabled()) addPass(createStructurizeCFGPass()); - if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - addPass(createSinkingPass()); - addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); - } else { - addPass(createR600TextureIntrinsicsReplacer()); - } return false; } bool AMDGPUPassConfig::addInstSelector() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} - if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); - addPass(createSIFoldOperandsPass()); - } +//===----------------------------------------------------------------------===// +// R600 Pass Setup +//===----------------------------------------------------------------------===// +bool R600PassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createR600TextureIntrinsicsReplacer()); return false; } -void AMDGPUPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); +void R600PassConfig::addPreRegAlloc() { + addPass(createR600VectorRegMerger(*TM)); +} - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createR600VectorRegMerger(*TM)); - } else { - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. +void R600PassConfig::addPreSched2() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createR600EmitClauseMarkers(), false); + if (ST.isIfCvtEnabled()) + addPass(&IfConverterID, false); + addPass(createR600ClauseMergePass(*TM), false); +} - // This should be run after scheduling, but before register allocation. It - // also needs extra copies to the address operand to be eliminated.
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); - } +void R600PassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + addPass(createR600ControlFlowFinalizer(*TM), false); +} - addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); - } +TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { + return new R600PassConfig(this, PM); } -void AMDGPUPassConfig::addPostRegAlloc() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); +//===----------------------------------------------------------------------===// +// GCN Pass Setup +//===----------------------------------------------------------------------===// - if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createSIPrepareScratchRegs(), false); - addPass(createSIShrinkInstructionsPass(), false); - } +bool GCNPassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createSinkingPass()); + addPass(createSITypeRewriter()); + addPass(createSIAnnotateControlFlowPass()); + return false; } -void AMDGPUPassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); +bool GCNPassConfig::addInstSelector() { + AMDGPUPassConfig::addInstSelector(); + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(createSIFoldOperandsPass()); + return false; +} - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) - addPass(&IfConverterID, false); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - addPass(createR600ClauseMergePass(*TM), false); - if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - addPass(createSIInsertWaits(*TM), false); +void GCNPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + + // This needs to be run directly before register allocation because + // earlier passes might recompute live intervals. + // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass + if (getOptLevel() > CodeGenOpt::None) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } -} -void AMDGPUPassConfig::addPreEmitPass() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createAMDGPUCFGStructurizerPass(), false); - addPass(createR600ExpandSpecialInstrsPass(*TM), false); - addPass(&FinalizeMachineBundlesID, false); - addPass(createR600Packetizer(*TM), false); - addPass(createR600ControlFlowFinalizer(*TM), false); - } else { - addPass(createSILowerControlFlowPass(*TM), false); + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also needs extra copies to the address operand to be eliminated.
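A note on the insertPass(&MachineSchedulerID, ...) calls in this region: unlike addPass(), which appends to the current phase, insertPass() splices the new pass in immediately after an existing pipeline point. Conceptually:

    #include <iterator>
    #include <list>
    #include <string>

    // Conceptual model of TargetPassConfig::insertPass(Anchor, NewPass):
    // place NewPass directly after the first occurrence of Anchor.
    static void insertAfter(std::list<std::string> &Pipeline,
                            const std::string &Anchor,
                            const std::string &NewPass) {
      for (auto It = Pipeline.begin(); It != Pipeline.end(); ++It) {
        if (*It == Anchor) {
          Pipeline.insert(std::next(It), NewPass);
          return;
        }
      }
    }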
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); } + addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIFixSGPRLiveRangesPass(), false); } +void GCNPassConfig::addPostRegAlloc() { + addPass(createSIPrepareScratchRegs(), false); + addPass(createSIShrinkInstructionsPass(), false); +} -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// +void GCNPassConfig::addPreSched2() { + addPass(createSIInsertWaits(*TM), false); +} -GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) : - AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } +void GCNPassConfig::addPreEmitPass() { + addPass(createSILowerControlFlowPass(*TM), false); +} + +TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { + return new GCNPassConfig(this, PM); +} diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h index 66b3070..785c119 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h @@ -29,6 +29,8 @@ namespace llvm { //===----------------------------------------------------------------------===// class AMDGPUTargetMachine : public LLVMTargetMachine { +private: + protected: TargetLoweringObjectFile *TLOF; AMDGPUSubtarget Subtarget; @@ -39,22 +41,36 @@ public: StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - const AMDGPUSubtarget *getSubtargetImpl() const override { + + const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { return &Subtarget; } const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; - /// \brief Register R600 analysis passes with a pass manager. 
- void addAnalysisPasses(PassManagerBase &PM) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF; } }; //===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +class R600TargetMachine : public AMDGPUTargetMachine { + +public: + R600TargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +//===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// @@ -64,6 +80,8 @@ public: GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 0bc62d0..6dacc74 100644 --- a/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -15,11 +15,12 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetTransformInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -27,78 +28,8 @@ using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeAMDGPUTTIPass(PassRegistry &); -} - -namespace { - -class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { - const AMDGPUTargetMachine *TM; - const AMDGPUSubtarget *ST; - const AMDGPUTargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - AMDGPUTTI(const AMDGPUTargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getSubtargetImpl()->getTargetLowering()) { - initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
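The getAdjustedAnalysisPointer() boilerplate being deleted below existed because of a multiple-inheritance detail: converting to a second base class changes the pointer value, so a void* stored for one interface cannot simply be reinterpreted as the other. A small self-contained demonstration:

    #include <cstdio>

    struct Base1 { int X; virtual ~Base1() {} };
    struct Base2 { int Y; virtual ~Base2() {} };
    struct Derived : Base1, Base2 {};

    int main() {
      Derived D;
      Base2 *B2 = &D; // the implicit upcast adjusts the address past Base1
      std::printf("Derived at %p, Base2 subobject at %p\n",
                  static_cast<void *>(&D), static_cast<void *>(B2));
      return 0;
    }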
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - bool hasBranchDivergence() const override; - - void getUnrollingPreferences(const Function *F, Loop *L, - UnrollingPreferences &UP) const override; - - PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override; - - unsigned getNumberOfRegisters(bool Vector) const override; - unsigned getRegisterBitWidth(bool Vector) const override; - unsigned getMaxInterleaveFactor() const override; -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti", - "AMDGPU Target Transform Info", true, true, false) -char AMDGPUTTI::ID = 0; - -ImmutablePass * -llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) { - return new AMDGPUTTI(TM); -} - -bool AMDGPUTTI::hasBranchDivergence() const { return true; } - -void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, - UnrollingPreferences &UP) const { +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. UP.MaxCount = UINT_MAX; UP.Partial = true; @@ -106,13 +37,15 @@ void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, // TODO: Do we want runtime unrolling? for (const BasicBlock *BB : L->getBlocks()) { + const DataLayout &DL = BB->getModule()->getDataLayout(); for (const Instruction &I : *BB) { const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) continue; const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr)); + const AllocaInst *Alloca = + dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); if (Alloca) { // We want to do whatever we can to limit the number of alloca // instructions that make it through to the code generator. allocas @@ -130,13 +63,7 @@ void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, } } -AMDGPUTTI::PopcntSupportKind -AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software; -} - -unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { if (Vec) return 0; @@ -147,11 +74,9 @@ unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTI::getRegisterBitWidth(bool) const { - return 32; -} +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } -unsigned AMDGPUTTI::getMaxInterleaveFactor() const { +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h new file mode 100644 index 0000000..791c84e --- /dev/null +++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h @@ -0,0 +1,78 @@ +//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares a TargetTransformInfo::Concept conforming object specific to the +/// AMDGPU target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { + typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + const AMDGPUSubtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) + : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { + BaseT::operator=(static_cast<const BaseT &>(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ?
TTI::PSK_FastHardware : TTI::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp index ee6e8ec..c9b25a1 100644 --- a/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -10,8 +10,8 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" -#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallVector.h" @@ -30,6 +30,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include <deque> using namespace llvm; @@ -165,6 +166,7 @@ public: TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); OrderedBlks.clear(); + Visited.clear(); FuncRep = &MF; MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); @@ -621,7 +623,7 @@ DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); ++It) { MachineInstr *instr = &(*It); - if (instr->getDebugLoc().isUnknown() == false) + if (instr->getDebugLoc()) DL = instr->getDebugLoc(); } return DL; @@ -1075,21 +1077,19 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { } int AMDGPUCFGStructurizer::loopendPatternMatch() { - std::vector<MachineLoop *> NestedLoops; - for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E; - ++It) - for (MachineLoop *ML : depth_first(*It)) - NestedLoops.push_back(ML); + std::deque<MachineLoop *> NestedLoops; + for (auto &It: *MLI) + for (MachineLoop *ML : depth_first(It)) + NestedLoops.push_front(ML); if (NestedLoops.size() == 0) return 0; - // Process nested loops outside->inside, so "continue" to an outside loop won't - // be mistaken as "break" of the current loop. + // Process nested loops outside->inside (we did push_front), + // so "continue" to an outside loop won't be mistaken as "break" + // of the current loop. int Num = 0; - for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(), - E = NestedLoops.rend(); It != E; ++It) { - MachineLoop *ExaminedLoop = *It; + for (MachineLoop *ExaminedLoop : NestedLoops) { if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) continue; DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); @@ -1611,7 +1611,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); - if (UseContinueLogical == false) { + if (!UseContinueLogical) { int BranchOpcode = TrueBranch == ContMBB ?
getBranchNzeroOpcode(OldOpcode) : getBranchZeroOpcode(OldOpcode); diff --git a/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp index 3b4ba1a..95025a6 100644 --- a/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" @@ -27,77 +29,107 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" using namespace llvm; namespace { -class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; - MCAsmParser &Parser; - - - /// @name Auto-generated Match Functions - /// { - -#define GET_ASSEMBLER_HEADER -#include "AMDGPUGenAsmMatcher.inc" - - /// } - -public: - AMDGPUAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &_MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) override; - bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - - bool parseCnt(int64_t &IntVal); - OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); -}; +struct OptionalOperand; class AMDGPUOperand : public MCParsedAsmOperand { enum KindTy { Token, - Immediate + Immediate, + Register, + Expression } Kind; + SMLoc StartLoc, EndLoc; + public: AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + MCContext *Ctx; + + enum ImmTy { + ImmTyNone, + ImmTyDSOffset0, + ImmTyDSOffset1, + ImmTyGDS, + ImmTyOffset, + ImmTyGLC, + ImmTySLC, + ImmTyTFE, + ImmTyClamp, + ImmTyOMod + }; + struct TokOp { const char *Data; unsigned Length; }; struct ImmOp { + bool IsFPImm; + ImmTy Type; int64_t Val; }; + struct RegOp { + unsigned RegNo; + int Modifiers; + const MCRegisterInfo *TRI; + bool IsForcedVOP3; + }; + union { TokOp Tok; ImmOp Imm; + RegOp Reg; + const MCExpr *Expr; }; void addImmOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::CreateImm(getImm())); - } - void addRegOperands(MCInst &Inst, unsigned N) const { - llvm_unreachable("addRegOperands"); + Inst.addOperand(MCOperand::createImm(getImm())); } + StringRef getToken() const { return StringRef(Tok.Data, Tok.Length); } + + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isReg()) + addRegOperands(Inst, N); + else + addImmOperands(Inst, N); + } + + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm( + Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); + addRegOperands(Inst, N); + } + + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + } + + bool defaultTokenHasSuffix() const { + StringRef Token(Tok.Data, Tok.Length); + + return Token.endswith("_e32") || Token.endswith("_e64"); + } + bool isToken() const override { return Kind == Token; } @@ -106,52 +138,384 @@ public: return Kind == Immediate; } + bool isInlineImm() const { + float F = BitsToFloat(Imm.Val); + // TODO: Add 0.5pi for VI + return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + } + + bool isDSOffset0() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset0; + } + + bool isDSOffset1() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset1; + } + int64_t getImm() const { return Imm.Val; } + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; + } + + bool isRegKind() const { + return Kind == Register; + } + bool isReg() const override { - return false; + return Kind == Register && Reg.Modifiers == -1; + } + + bool isRegWithInputMods() const { + return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + } + + void setModifiers(unsigned Mods) { + assert(isReg()); + Reg.Modifiers = Mods; + } + + bool hasModifiers() const { + assert(isRegKind()); + return Reg.Modifiers != -1; } unsigned getReg() const override { - return 0; + return Reg.RegNo; + } + + bool isRegOrImm() const { + return isReg() || isImm(); + } + + bool isRegClass(unsigned RCID) const { + return Reg.TRI->getRegClass(RCID).contains(getReg()); + } + + bool isSCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc64() const { + return isImm() || isInlineImm() || + (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + } + + bool isVCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVCSrc64() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isVSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVSrc64() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); } bool isMem() const override { return false; } + bool isExpr() const { + return Kind == Expression; + } + + bool isSoppBrTarget() const { + return isExpr() || isImm(); + } + SMLoc getStartLoc() const override { - return SMLoc(); + return StartLoc; } SMLoc getEndLoc() const override { - return SMLoc(); + return EndLoc; } void print(raw_ostream &OS) const override { } - static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val) { + static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { auto Op = llvm::make_unique<AMDGPUOperand>(Immediate); Op->Imm.Val = Val; + Op->Imm.IsFPImm = IsFPImm; + Op->Imm.Type = Type; + Op->StartLoc = Loc; + Op->EndLoc = Loc; return Op; } - static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc) { + static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { auto Res = llvm::make_unique<AMDGPUOperand>(Token); Res->Tok.Data = Str.data(); 
Res->Tok.Length = Str.size(); + Res->StartLoc = Loc; + Res->EndLoc = Loc; return Res; } + static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + bool ForceVOP3) { + auto Op = llvm::make_unique<AMDGPUOperand>(Register); + Op->Reg.RegNo = RegNo; + Op->Reg.TRI = TRI; + Op->Reg.Modifiers = -1; + Op->Reg.IsForcedVOP3 = ForceVOP3; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AMDGPUOperand> CreateExpr(const class MCExpr *Expr, SMLoc S) { + auto Op = llvm::make_unique<AMDGPUOperand>(Expression); + Op->Expr = Expr; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + bool isDSOffset() const; + bool isDSOffset01() const; bool isSWaitCnt() const; + bool isMubufOffset() const; +}; + +class AMDGPUAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + const MCInstrInfo &MII; + MCAsmParser &Parser; + + unsigned ForcedEncodingSize; + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "AMDGPUGenAsmMatcher.inc" + + /// } + +public: + AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0){ + + if (STI.getFeatureBits().none()) { + // Set default features. + STI.ToggleFeature("SOUTHERN_ISLANDS"); + } + + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + unsigned getForcedEncodingSize() const { + return ForcedEncodingSize; + } + + void setForcedEncodingSize(unsigned Size) { + ForcedEncodingSize = Size; + } + + bool isForcedVOP3() const { + return ForcedEncodingSize == 64; + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + bool ParseDirective(AsmToken DirectiveID) override; + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default = 0); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, + OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseOptionalOps( + const ArrayRef<OptionalOperand> &OptionalOps, + OperandVector &Operands); + + + void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); + + bool parseCnt(int64_t &IntVal); + OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + + void cvtMubuf(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseOffset(OperandVector &Operands); + OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseGLC(OperandVector 
&Operands); + OperandMatchResultTy parseSLC(OperandVector &Operands); + OperandMatchResultTy parseTFE(OperandVector &Operands); + + OperandMatchResultTy parseDMask(OperandVector &Operands); + OperandMatchResultTy parseUNorm(OperandVector &Operands); + OperandMatchResultTy parseR128(OperandVector &Operands); + + void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); +}; + +struct OptionalOperand { + const char *Name; + AMDGPUOperand::ImmTy Type; + bool IsBit; + int64_t Default; + bool (*ConvertResult)(int64_t&); }; } +static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { + if (IsVgpr) { + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::VGPR_32RegClassID; + case 2: return AMDGPU::VReg_64RegClassID; + case 3: return AMDGPU::VReg_96RegClassID; + case 4: return AMDGPU::VReg_128RegClassID; + case 8: return AMDGPU::VReg_256RegClassID; + case 16: return AMDGPU::VReg_512RegClassID; + } + } + + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } +} + +static unsigned getRegForName(const StringRef &RegName) { + + return StringSwitch<unsigned>(RegName) + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("m0", AMDGPU::M0) + .Case("scc", AMDGPU::SCC) + .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Default(0); +} + bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - return true; + const AsmToken Tok = Parser.getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + const StringRef &RegName = Tok.getString(); + RegNo = getRegForName(RegName); + + if (RegNo) { + Parser.Lex(); + return false; + } + + // Match vgprs and sgprs + if (RegName[0] != 's' && RegName[0] != 'v') + return true; + + bool IsVgpr = RegName[0] == 'v'; + unsigned RegWidth; + unsigned RegIndexInClass; + if (RegName.size() > 1) { + // We have a 32-bit register + RegWidth = 1; + if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + return true; + Parser.Lex(); + } else { + // We have a register greater than 32-bits. + + int64_t RegLo, RegHi; + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegLo)) + return true; + + if (getLexer().isNot(AsmToken::Colon)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegHi)) + return true; + + if (getLexer().isNot(AsmToken::RBrac)) + return true; + + Parser.Lex(); + RegWidth = (RegHi - RegLo) + 1; + if (IsVgpr) { + // VGPR registers aren't aligned. + RegIndexInClass = RegLo; + } else { + // SGPR registers are aligned. Max alignment is 4 dwords. 
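ParseRegister above accepts three spellings: named registers (vcc, exec, m0, ...), single 32-bit registers like v7, and ranges like s[2:5] whose width selects the register class through getRegClass(). A simplified sketch of the range decoding, assuming the whole spelling arrives as one string (the real parser works token-by-token through the MC lexer):

#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <string>

// Decode "v7" or "s[2:5]" into prefix kind, first register and width.
// Returns false for anything else; "s[2:5]" yields Lo=2, Width=4.
static bool parseGPRSpelling(const std::string &Tok, bool &IsVGPR,
                             unsigned &Lo, unsigned &Width) {
  if (Tok.size() < 2 || (Tok[0] != 'v' && Tok[0] != 's'))
    return false;
  IsVGPR = (Tok[0] == 'v');
  if (std::isdigit(static_cast<unsigned char>(Tok[1]))) {
    Lo = std::strtoul(Tok.c_str() + 1, nullptr, 10);
    Width = 1;
    return true;
  }
  unsigned Hi;
  if (std::sscanf(Tok.c_str() + 1, "[%u:%u]", &Lo, &Hi) != 2 || Hi < Lo)
    return false;
  Width = Hi - Lo + 1;
  return true;
}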
+ RegIndexInClass = RegLo / std::min(RegWidth, 4u); + } + } + + const MCRegisterInfo *TRC = getContext().getRegisterInfo(); + unsigned RC = getRegClass(IsVgpr, RegWidth); + if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + return true; + RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); + return false; +} + +unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + + if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || + (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) + return Match_InvalidOperand; + + return Match_Success; } @@ -163,22 +527,52 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst Inst; switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { - case Match_Success: - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - case Match_MissingFeature: - return Error(IDLoc, "instruction use requires an option to be enabled"); - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); - case Match_InvalidOperand: { - if (ErrorInfo != ~0ULL) { - if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); + default: break; + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction not supported on this GPU"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) { + if (isForcedVOP3()) { + // If 64-bit encoding has been forced we can end up with no + // clamp or omod operands if none of the registers have modifiers, + // so we need to add these to the operand list. 
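The division by std::min(RegWidth, 4u) above reflects that SGPR tuples are allocated at aligned offsets, so the index inside a multi-register class is the first register number divided by the tuple alignment, which tops out at four dwords. A worked one-liner of that mapping (the numeric example is mine, not from the source):

#include <algorithm>

// s[8:15]: RegLo=8, RegWidth=8 -> index 8 / min(8,4) = 2 in SReg_256.
// VGPR tuples are unaligned, so their index is simply RegLo.
static unsigned sgprIndexInClass(unsigned RegLo, unsigned RegWidth) {
  return RegLo / std::min(RegWidth, 4u);
}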
+ AMDGPUOperand &LastOp = + ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + if (LastOp.isRegKind() || + (LastOp.isImm() && + LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { + SMLoc S = Parser.getTok().getLoc(); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyClamp)); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOMod)); + bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, + Out, ErrorInfo, + MatchingInlineAsm); + if (!Res) + return Res; + } + + } + return Error(IDLoc, "too few operands for instruction"); + } + + ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); } - return Error(IDLoc, "invalid operand for instruction"); - } } llvm_unreachable("Implement any new match types added!"); } @@ -187,6 +581,19 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { return true; } +static bool operandsHaveModifiers(const OperandVector &Operands) { + + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); + if (Op.isRegKind() && Op.hasModifiers()) + return true; + if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || + Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) + return true; + } + return false; +} + AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { @@ -195,17 +602,105 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { // If we successfully parsed the operand or if there as an error parsing, // we are done. - if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail) + // + // If we are parsing after we reach EndOfStatement then this means we + // are appending default values to the Operands list. This is only done + // by custom parser, so we shouldn't continue on to the generic parsing. + if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || + getLexer().is(AsmToken::EndOfStatement)) return ResTy; + bool Negate = false, Abs = false; + if (getLexer().getKind()== AsmToken::Minus) { + Parser.Lex(); + Negate = true; + } + + if (getLexer().getKind() == AsmToken::Pipe) { + Parser.Lex(); + Abs = true; + } + switch(getLexer().getKind()) { case AsmToken::Integer: { + SMLoc S = Parser.getTok().getLoc(); int64_t IntVal; if (getParser().parseAbsoluteExpression(IntVal)) return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(IntVal)); + APInt IntVal32(32, IntVal); + if (IntVal32.getSExtValue() != IntVal) { + Error(S, "invalid immediate: only 32-bit values are legal"); + return MatchOperand_ParseFail; + } + + IntVal = IntVal32.getSExtValue(); + if (Negate) + IntVal *= -1; + Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); return MatchOperand_Success; } + case AsmToken::Real: { + // FIXME: We should emit an error if a double precisions floating-point + // value is used. I'm not sure the best way to detect this. 
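The APInt round trip above (truncate to 32 bits, sign-extend, compare) is how the parser rejects immediates that cannot fit the 32-bit encoding before matching even starts. An equivalent check without APInt, for intuition:

#include <cstdint>

// Same condition as IntVal32.getSExtValue() == IntVal: the value
// survives a round trip through a signed 32-bit integer.
static bool isLegalImm32(int64_t V) {
  return V >= INT32_MIN && V <= INT32_MAX;
}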
+ SMLoc S = Parser.getTok().getLoc(); + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + + APFloat F((float)BitsToDouble(IntVal)); + if (Negate) + F.changeSign(); + Operands.push_back( + AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); + return MatchOperand_Success; + } + case AsmToken::Identifier: { + SMLoc S, E; + unsigned RegNo; + if (!ParseRegister(RegNo, S, E)) { + + bool HasModifiers = operandsHaveModifiers(Operands); + unsigned Modifiers = 0; + + if (Negate) + Modifiers |= 0x1; + + if (Abs) { + if (getLexer().getKind() != AsmToken::Pipe) + return MatchOperand_ParseFail; + Parser.Lex(); + Modifiers |= 0x2; + } + + if (Modifiers && !HasModifiers) { + // We are adding a modifier to src1 or src2 and previous sources + // don't have modifiers, so we need to go back and empty modifers + // for each previous source. + for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; + --PrevRegIdx) { + + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); + RegOp.setModifiers(0); + } + } + + + Operands.push_back(AMDGPUOperand::CreateReg( + RegNo, S, E, getContext().getRegisterInfo(), + isForcedVOP3())); + + if (HasModifiers || Modifiers) { + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); + RegOp.setModifiers(Modifiers); + + } + } else { + Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), + S)); + Parser.Lex(); + } + return MatchOperand_Success; + } default: return MatchOperand_NoMatch; } @@ -214,23 +709,283 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { + + // Clear any forced encodings from the previous instruction. + setForcedEncodingSize(0); + + if (Name.endswith("_e64")) + setForcedEncodingSize(64); + else if (Name.endswith("_e32")) + setForcedEncodingSize(32); + // Add the instruction mnemonic Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); - if (getLexer().is(AsmToken::EndOfStatement)) - return false; + while (!getLexer().is(AsmToken::EndOfStatement)) { + AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); + + // Eat the comma or space if there is one. + if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); - AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); - switch (Res) { - case MatchOperand_Success: return false; - case MatchOperand_ParseFail: return Error(NameLoc, - "Failed parsing operand"); - case MatchOperand_NoMatch: return Error(NameLoc, "Not a valid operand"); + switch (Res) { + case MatchOperand_Success: break; + case MatchOperand_ParseFail: return Error(getLexer().getLoc(), + "failed parsing operand."); + case MatchOperand_NoMatch: return Error(getLexer().getLoc(), + "not a valid operand."); + } } - return true; + + // Once we reach end of statement, continue parsing so we can add default + // values for optional arguments. 
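The 0x1/0x2 constants above are the source-modifier bits: bit 0 negates the operand and bit 1 takes its absolute value (written with pipes, as in -|v0|). A small sketch of how the two bits compose, with abs applied before neg as the syntax implies; the bit values are taken from the parser above:

#include <cmath>

enum SrcMods : unsigned { SRC_MOD_NEG = 0x1, SRC_MOD_ABS = 0x2 };

static unsigned makeSrcMods(bool Negate, bool Abs) {
  unsigned Mods = 0;
  if (Negate) Mods |= SRC_MOD_NEG;
  if (Abs)    Mods |= SRC_MOD_ABS;
  return Mods;
}

// For intuition only: "-|x|" sets both bits and evaluates as -fabs(x).
static float applySrcMods(float X, unsigned Mods) {
  if (Mods & SRC_MOD_ABS) X = std::fabs(X);
  if (Mods & SRC_MOD_NEG) X = -X;
  return X;
}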
+ AMDGPUAsmParser::OperandMatchResultTy Res; + while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Error(getLexer().getLoc(), "failed parsing operand."); + } + return false; +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default) { + + // We are at the end of the statement, and this is a default argument, so + // use a default value. + if (getLexer().is(AsmToken::EndOfStatement)) { + Int = Default; + return MatchOperand_Success; + } + + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Identifier: { + StringRef OffsetName = Parser.getTok().getString(); + if (!OffsetName.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + if (getParser().parseAbsoluteExpression(Int)) + return MatchOperand_ParseFail; + break; + } + } + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + + SMLoc S = Parser.getTok().getLoc(); + int64_t Offset = 0; + + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + if (Res != MatchOperand_Success) + return Res; + + Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + int64_t Bit = 0; + SMLoc S = Parser.getTok().getLoc(); + + // We are at the end of the statement, and this is a default argument, so + // use a default value. 
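parseIntWithPrefix above implements the <name>:<integer> operand grammar (e.g. offset:16) and substitutes a default once the statement has already ended, which is how omitted optional operands get materialized. A string-level sketch of the same grammar, assuming a single pre-lexed token rather than the real lexer stream:

#include <cstdint>
#include <string>

// Parse "offset:16" style text: <Prefix> ':' <integer>. An empty token
// models end-of-statement and yields the default value.
static bool parsePrefixedInt(const std::string &Tok, const char *Prefix,
                             int64_t &Out, int64_t Default = 0) {
  if (Tok.empty()) {        // end of statement: use the default
    Out = Default;
    return true;
  }
  std::string::size_type Colon = Tok.find(':');
  if (Colon == std::string::npos || Tok.compare(0, Colon, Prefix) != 0)
    return false;
  Out = std::stoll(Tok.substr(Colon + 1));  // throws on malformed digits
  return true;
}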
+ if (getLexer().isNot(AsmToken::EndOfStatement)) { + switch(getLexer().getKind()) { + case AsmToken::Identifier: { + StringRef Tok = Parser.getTok().getString(); + if (Tok == Name) { + Bit = 1; + Parser.Lex(); + } else if (Tok.startswith("no") && Tok.endswith(Name)) { + Bit = 0; + Parser.Lex(); + } else { + return MatchOperand_NoMatch; + } + break; + } + default: + return MatchOperand_NoMatch; + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); + return MatchOperand_Success; +} + +static bool operandsHasOptionalOp(const OperandVector &Operands, + const OptionalOperand &OOp) { + for (unsigned i = 0; i < Operands.size(); i++) { + const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); + if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || + (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) + return true; + + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps, + OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + for (const OptionalOperand &Op : OptionalOps) { + if (operandsHasOptionalOp(Operands, Op)) + continue; + AMDGPUAsmParser::OperandMatchResultTy Res; + int64_t Value; + if (Op.IsBit) { + Res = parseNamedBit(Op.Name, Operands, Op.Type); + if (Res == MatchOperand_NoMatch) + continue; + return Res; + } + + Res = parseIntWithPrefix(Op.Name, Value, Op.Default); + + if (Res == MatchOperand_NoMatch) + continue; + + if (Res != MatchOperand_Success) + return Res; + + if (Op.ConvertResult && !Op.ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; } //===----------------------------------------------------------------------===// +// ds +//===----------------------------------------------------------------------===// + +static const OptionalOperand DSOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +static const OptionalOperand DSOptionalOpsOff01 [] = { + {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, + {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOps, Operands); +} +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOpsOff01, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + AMDGPUAsmParser::OperandMatchResultTy Res = + parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); + if (Res == MatchOperand_NoMatch) { + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOffset)); + Res = MatchOperand_Success; + } + return Res; +} + +bool AMDGPUOperand::isDSOffset() const { + return isImm() && isUInt<16>(getImm()); +} + +bool AMDGPUOperand::isDSOffset01() const { + return isImm() && isUInt<8>(getImm()); +} + +void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, + const OperandVector &Operands) { + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand 
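parseNamedBit above gives every single-bit flag a three-way spelling: the bare name sets it (glc), a no-prefixed form clears it (noglc), and omission leaves the default. A stricter sketch of the token test; note the in-tree version uses startswith("no")/endswith(Name), which is slightly looser than an exact "no" + Name match:

#include <cstdint>
#include <string>

// "glc" -> Bit=1, "noglc" -> Bit=0, anything else -> no match.
static bool parseNamedBitToken(const std::string &Tok,
                               const std::string &Name, int64_t &Bit) {
  if (Tok == Name) {
    Bit = 1;
    return true;
  }
  if (Tok == "no" + Name) {
    Bit = 0;
    return true;
  }
  return false;
}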
&)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; + unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + + ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 + ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + +void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + bool GDSOnly = false; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + if (Op.isToken() && Op.getToken() == "gds") { + GDSOnly = true; + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + + if (!GDSOnly) { + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + } + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + + +//===----------------------------------------------------------------------===// // s_waitcnt //===----------------------------------------------------------------------===// @@ -284,6 +1039,7 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { // expcnt [6:4] // lgkmcnt [10:8] int64_t CntVal = 0x77f; + SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { default: return MatchOperand_ParseFail; @@ -300,7 +1056,7 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { } while(getLexer().isNot(AsmToken::EndOfStatement)); break; } - Operands.push_back(AMDGPUOperand::CreateImm(CntVal)); + Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); return MatchOperand_Success; } @@ -308,6 +1064,245 @@ bool AMDGPUOperand::isSWaitCnt() const { return isImm(); } +//===----------------------------------------------------------------------===// +// sopp branch targets +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + switch (getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: { + int64_t Imm; + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + return MatchOperand_Success; + } + + case AsmToken::Identifier: + Operands.push_back(AMDGPUOperand::CreateExpr( + MCSymbolRefExpr::create(getContext().getOrCreateSymbol( + Parser.getTok().getString()), getContext()), S)); + Parser.Lex(); + return MatchOperand_Success; + } +} + +//===----------------------------------------------------------------------===// +// mubuf +//===----------------------------------------------------------------------===// + +static const OptionalOperand MubufOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"glc", 
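The 0x77f default in the s_waitcnt parser above packs three saturated counters: vmcnt in bits [3:0], expcnt in [6:4] and lgkmcnt in [10:8], so the all-ones value means "wait on nothing" and each named counter clears only its own field. A sketch of the packing, with the layout taken from the comments in the parser:

#include <cstdint>

// vmcnt [3:0], expcnt [6:4], lgkmcnt [10:8]; encodeWaitcnt(0xf,7,7)
// reproduces the 0x77f "don't wait" default.
static uint16_t encodeWaitcnt(unsigned VmCnt, unsigned ExpCnt,
                              unsigned LgkmCnt) {
  return static_cast<uint16_t>((VmCnt & 0xf) | ((ExpCnt & 0x7) << 4) |
                               ((LgkmCnt & 0x7) << 8));
}

// e.g. setWaitcntField(0x77f, 0xf, 0, 0) == 0x770, i.e. "vmcnt(0)".
static uint16_t setWaitcntField(uint16_t Val, uint16_t Mask,
                                unsigned Shift, unsigned Cnt) {
  return static_cast<uint16_t>((Val & ~(Mask << Shift)) |
                               ((Cnt & Mask) << Shift));
}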
AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { + return parseOptionalOps(MubufOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOffset(OperandVector &Operands) { + return parseIntWithPrefix("offset", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseGLC(OperandVector &Operands) { + return parseNamedBit("glc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSLC(OperandVector &Operands) { + return parseNamedBit("slc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseTFE(OperandVector &Operands) { + return parseNamedBit("tfe", Operands); +} + +bool AMDGPUOperand::isMubufOffset() const { + return isImm() && isUInt<12>(getImm()); +} + +void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, + const OperandVector &Operands) { + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + assert(OptionalIdx.size() == 4); + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mimg +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDMask(OperandVector &Operands) { + return parseIntWithPrefix("dmask", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { + return parseNamedBit("unorm", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseR128(OperandVector &Operands) { + return parseNamedBit("r128", Operands); +} + +//===----------------------------------------------------------------------===// +// vop3 +//===----------------------------------------------------------------------===// + +static bool ConvertOmodMul(int64_t &Mul) { + if (Mul != 1 && Mul != 2 && Mul != 4) + return false; + + Mul >>= 1; + return true; +} + +static bool ConvertOmodDiv(int64_t &Div) { + if (Div == 1) { + Div = 0; + return true; + } + + if (Div == 2) { + Div = 3; + return true; + } + + return false; +} + +static const OptionalOperand VOP3OptionalOps [] = { + {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, + {"mul", AMDGPUOperand::ImmTyOMod, false, 1, 
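ConvertOmodMul/ConvertOmodDiv above fold the written multiplier into the 2-bit VOP3 output-modifier field: 0 is none, 1 means *2, 2 means *4, 3 means /2, which is why mul:N becomes N>>1 and div:2 becomes 3. The combined mapping, as a sketch:

#include <cstdint>

// omod field: 0 = none, 1 = *2, 2 = *4, 3 = /2.
static bool convertOmod(bool IsMul, int64_t &Val) {
  if (IsMul) {
    if (Val != 1 && Val != 2 && Val != 4)
      return false;
    Val >>= 1;                               // 1 -> 0, 2 -> 1, 4 -> 2
    return true;
  }
  if (Val == 1) { Val = 0; return true; }    // div:1 is a no-op
  if (Val == 2) { Val = 3; return true; }    // div:2
  return false;                              // no /4 encoding exists
}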
ConvertOmodMul}, + {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, +}; + +static bool isVOP3(OperandVector &Operands) { + if (operandsHaveModifiers(Operands)) + return true; + + AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); + + if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) + return true; + + if (Operands.size() >= 5) + return true; + + if (Operands.size() > 3) { + AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); + if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || + Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) + return true; + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { + + // The value returned by this function may change after parsing + // an operand so store the original value here. + bool HasModifiers = operandsHaveModifiers(Operands); + + bool IsVOP3 = isVOP3(Operands); + if (HasModifiers || IsVOP3 || + getLexer().isNot(AsmToken::EndOfStatement) || + getForcedEncodingSize() == 64) { + + AMDGPUAsmParser::OperandMatchResultTy Res = + parseOptionalOps(VOP3OptionalOps, Operands); + + if (!HasModifiers && Res == MatchOperand_Success) { + // We have added a modifier operation, so we need to make sure all + // previous register operands have modifiers + for (unsigned i = 2, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); + if (Op.isReg()) + Op.setModifiers(0); + } + } + return Res; + } + return MatchOperand_NoMatch; +} + +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + unsigned i = 2; + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + if (operandsHaveModifiers(Operands)) { + for (unsigned e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + if (Op.isRegWithInputMods()) { + ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); + continue; + } + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; + unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; + + ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); + } else { + for (unsigned e = Operands.size(); i != e; ++i) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); + } +} + /// Force static initialization. 
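The isVOP3() heuristic above decides when an unsuffixed mnemonic must take the 64-bit encoding anyway: any source modifier, a 64-bit SGPR destination, five or more parsed operands, or an SGPR in the src1 slot, which VOP2 cannot encode. Condensed into a sketch over a pre-digested summary, since the real code inspects the OperandVector directly:

struct OperandSummary {
  bool HasModifiers;    // neg/abs/clamp/omod seen anywhere
  bool DstIsSGPR64;     // e.g. an explicit carry/compare result pair
  unsigned NumOperands; // mnemonic + dst + sources
  bool Src1IsSGPR;      // VOP2 requires a VGPR in src1
};

static bool needsVOP3(const OperandSummary &O) {
  return O.HasModifiers || O.DstIsSGPR64 || O.NumOperands >= 5 ||
         O.Src1IsSGPR;
}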
extern "C" void LLVMInitializeR600AsmParser() { RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget); diff --git a/contrib/llvm/lib/Target/R600/CIInstructions.td b/contrib/llvm/lib/Target/R600/CIInstructions.td index fdb58bb..560aa78 100644 --- a/contrib/llvm/lib/Target/R600/CIInstructions.td +++ b/contrib/llvm/lib/Target/R600/CIInstructions.td @@ -11,9 +11,9 @@ def isCIVI : Predicate < - "Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->; + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; //===----------------------------------------------------------------------===// // VOP1 Instructions diff --git a/contrib/llvm/lib/Target/R600/CaymanInstructions.td b/contrib/llvm/lib/Target/R600/CaymanInstructions.td index 433c3fc..ba4df82 100644 --- a/contrib/llvm/lib/Target/R600/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/R600/CaymanInstructions.td @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -def isCayman : Predicate<"Subtarget.hasCaymanISA()">; +def isCayman : Predicate<"Subtarget->hasCaymanISA()">; //===----------------------------------------------------------------------===// // Cayman Instructions diff --git a/contrib/llvm/lib/Target/R600/EvergreenInstructions.td b/contrib/llvm/lib/Target/R600/EvergreenInstructions.td index 299d1fa..7adcd46 100644 --- a/contrib/llvm/lib/Target/R600/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/R600/EvergreenInstructions.td @@ -14,14 +14,14 @@ //===----------------------------------------------------------------------===// def isEG : Predicate< - "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!Subtarget.hasCaymanISA()" + "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " + "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "!Subtarget->hasCaymanISA()" >; def isEGorCayman : Predicate< - "Subtarget.getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget.getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" + "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" + "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" >; //===----------------------------------------------------------------------===// @@ -287,9 +287,8 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", VecALU >; -// XXX: This pattern is broken, disabling for now. See comment in -// AMDGPUInstructions.td for more info. 
-// def : BFEPattern <BFE_UINT_eg>; +def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>; + def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], VecALU @@ -336,6 +335,9 @@ defm CUBE_eg : CUBE_Common<0xC0>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; +def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; +def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; + def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; @@ -590,8 +592,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; -def : FROUNDPat <CNDGE_eg, CNDGT_eg>; - def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 0; // VALID_PIXEL_MODE diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index b66ed10..f706769 100644 --- a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -20,7 +20,7 @@ using namespace llvm; void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot) { + StringRef Annot, const MCSubtargetInfo &STI) { OS.flush(); printInstruction(MI, OS); @@ -89,14 +89,24 @@ void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << " offset0:"; - printU8ImmDecOperand(MI, OpNo, O); + if (MI->getOperand(OpNo).getImm()) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); + } } void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << " offset1:"; - printU8ImmDecOperand(MI, OpNo, O); + if (MI->getOperand(OpNo).getImm()) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " gds"; } void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, @@ -117,7 +127,8 @@ void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, O << " tfe"; } -void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { +void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, + const MCRegisterInfo &MRI) { switch (reg) { case AMDGPU::VCC: O << "vcc"; @@ -208,6 +219,16 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else + O << "_e32 "; + + printOperand(MI, OpNo, O); +} + void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); if (SImm >= -16 && SImm <= 64) { @@ -277,7 +298,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; default: - printRegOperand(Op.getReg(), O); + printRegOperand(Op.getReg(), O, MRI); break; } } else if (Op.isImm()) { @@ -316,7 +337,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } else if (Op.isExpr()) { const MCExpr *Exp = Op.getExpr(); - Exp->print(O); + Exp->print(O, &MAI); } else { llvm_unreachable("unknown operand type in 
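The InstPrinter changes above make the optional DS fields self-eliding: offset0/offset1/gds are printed only when non-zero, so default-valued operands materialized by the parser do not reappear in disassembly. The pattern, in miniature:

#include <cstdint>
#include <ostream>

// Print " name:value" only when the value is meaningful, keeping
// round-tripped assembly free of "offset0:0 offset1:0" noise.
static void printOptionalField(std::ostream &OS, const char *Name,
                               uint64_t Value) {
  if (Value)
    OS << ' ' << Name << ':' << Value;
}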
printOperand"); } diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 1d43c7a..14fb511 100644 --- a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -29,7 +29,10 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + static void printRegOperand(unsigned RegNo, raw_ostream &O, + const MCRegisterInfo &MRI); private: void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -44,10 +47,12 @@ private: void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printImmediate32(uint32_t I, raw_ostream &O); void printImmediate64(uint64_t I, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index 5fb311b..3713223 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -24,19 +24,19 @@ namespace { class AMDGPUMCObjectWriter : public MCObjectWriter { public: - AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { } - void ExecutePostLayoutBinding(MCAssembler &Asm, + AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} + void executePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout) override { //XXX: Implement if necessary. 
} - void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) override { assert(!"Not implemented"); } - void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; }; @@ -64,10 +64,10 @@ public: } //End anonymous namespace -void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm, +void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) { for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { - Asm.writeSectionData(I, Layout); + Asm.writeSectionData(&*I, Layout); } } @@ -115,8 +115,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( } bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - for (unsigned i = 0; i < Count; ++i) - OW->Write8(0); + OW->WriteZeros(Count); return true; } @@ -131,7 +130,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { public: ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAMDGPUELFObjectWriter(OS); } }; diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 5fb94d5..59f45ff 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -33,7 +33,7 @@ protected: AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() : MCELFObjectTargetWriter(false, 0, 0, false) { } -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_ostream &OS) { +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); return createELFObjectWriter(MOTW, OS, true); } diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 19d89fb..028a86d 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -11,7 +11,7 @@ #include "AMDGPUMCAsmInfo.h" using namespace llvm; -AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() { +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MaxInstLength = 16; diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h index 8f75c76..a5bac51 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -17,7 +17,7 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { -class StringRef; +class Triple; // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, // you will need to make sure your new class sets PrivateGlobalPrefix to @@ -26,7 +26,7 @@ class StringRef; // with 'L' as a local symbol. 
class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: - explicit AMDGPUMCAsmInfo(StringRef &TT); + explicit AMDGPUMCAsmInfo(const Triple &TT); }; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 83403ba..1bc205d 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -17,6 +17,7 @@ #include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" #include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -59,63 +60,31 @@ static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); + X->initMCCodeGenInfo(RM, CM, OL); return X; } -static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T, +static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { + const MCRegisterInfo &MRI) { return new AMDGPUInstPrinter(MAI, MII, MRI); } -static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { - return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); - } else { - return createR600MCCodeEmitter(MCII, MRI, STI); - } -} - -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - return createELFStreamer(Ctx, MAB, _OS, _Emitter, false); -} - extern "C" void LLVMInitializeR600TargetMC() { + for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); + + TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + } - RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); - RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget); - - TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo); - - TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo); - - TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo); - - TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo); - - TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter); - - TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, 
createAMDGPUMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter); - - TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend); - - TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer); + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + createR600MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); } diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index bc8cd53..9a7548e 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -16,6 +16,7 @@ #ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H #define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#include "llvm/Support/DataTypes.h" #include "llvm/ADT/StringRef.h" namespace llvm { @@ -27,6 +28,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class Target; +class raw_pwrite_stream; class raw_ostream; extern Target TheAMDGPUTarget; @@ -34,17 +36,16 @@ extern Target TheGCNTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI); + MCContext &Ctx); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); -MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS); +MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); } // End llvm namespace #define GET_REGINFO_ENUM diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index dc1344f..e683498 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -30,8 +31,8 @@ using namespace llvm; namespace { class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + void operator=(const R600MCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; @@ -41,7 +42,7 @@ public: : MCII(mcii), MRI(mri) { } /// \brief Encode the instruction and write it to the OS. 
- void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -81,12 +82,12 @@ enum FCInstr { }; MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { + const MCRegisterInfo &MRI, + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } -void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, +void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); @@ -99,7 +100,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, } else if (IS_VTX(Desc)) { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits() & AMDGPU::FeatureCaymanISA)) { + if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { InstWord2 |= 1 << 19; // Mega-Fetch bit } @@ -132,7 +133,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, Emit((uint32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && + if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && ((Desc.TSFlags & R600_InstFlag::OP1) || Desc.TSFlags & R600_InstFlag::OP2)) { uint64_t ISAOpCode = Inst & (0x3FFULL << 39); @@ -148,15 +149,11 @@ void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { } void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { - for (unsigned i = 0; i < 4; i++) { - OS.write((uint8_t) ((Value >> (8 * i)) & 0xff)); - } + support::endian::Writer<support::little>(OS).write(Value); } void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { - for (unsigned i = 0; i < 8; i++) { - EmitByte((Value >> (8 * i)) & 0xff, OS); - } + support::endian::Writer<support::little>(OS).write(Value); } unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index 640de3f..65a0eeb 100644 --- a/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -31,15 +31,9 @@ using namespace llvm; namespace { -/// \brief Helper type used in encoding -typedef union { - int32_t I; - float F; -} IntFloatUnion; - class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; MCContext &Ctx; @@ -48,17 +42,17 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO) const; + uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) : MCII(mcii), MRI(mri), Ctx(ctx) { } - ~SIMCCodeEmitter() { } + ~SIMCCodeEmitter() override {} /// \brief Encode the instruction and write it to the OS. 
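Replacing the manual byte loops with support::endian::Writer<support::little> above keeps the emitted encoding little-endian regardless of host byte order. The equivalent logic spelled out by hand, for reference:

#include <cstdint>
#include <ostream>

// Emit Value least-significant byte first, independent of host
// endianness; intended for uint32_t/uint64_t instruction words.
template <typename T>
static void writeLE(std::ostream &OS, T Value) {
  for (unsigned I = 0; I < sizeof(T); ++I)
    OS.put(static_cast<char>((Value >> (8 * I)) & 0xff));
}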
- void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -78,7 +72,6 @@ public: MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new SIMCCodeEmitter(MCII, MRI, Ctx); } @@ -91,52 +84,102 @@ bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, OpType == AMDGPU::OPERAND_REG_INLINE_C; } -uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. +template <typename IntTy> +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; - IntFloatUnion Imm; - if (MO.isImm()) - Imm.I = MO.getImm(); - else if (MO.isFPImm()) - Imm.F = MO.getFPImm(); - else if (MO.isExpr()) - return 255; - else - return ~0; + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); + + return 0; +} + +static uint32_t getLit32Encoding(uint32_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == FloatToBits(0.5f)) + return 240; + + if (Val == FloatToBits(-0.5f)) + return 241; + + if (Val == FloatToBits(1.0f)) + return 242; + + if (Val == FloatToBits(-1.0f)) + return 243; + + if (Val == FloatToBits(2.0f)) + return 244; + + if (Val == FloatToBits(-2.0f)) + return 245; - if (Imm.I >= 0 && Imm.I <= 64) - return 128 + Imm.I; + if (Val == FloatToBits(4.0f)) + return 246; + + if (Val == FloatToBits(-4.0f)) + return 247; + + return 255; +} - if (Imm.I >= -16 && Imm.I <= -1) - return 192 + abs(Imm.I); +static uint32_t getLit64Encoding(uint64_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); + if (IntImm != 0) + return IntImm; - if (Imm.F == 0.5f) + if (Val == DoubleToBits(0.5)) return 240; - if (Imm.F == -0.5f) + if (Val == DoubleToBits(-0.5)) return 241; - if (Imm.F == 1.0f) + if (Val == DoubleToBits(1.0)) return 242; - if (Imm.F == -1.0f) + if (Val == DoubleToBits(-1.0)) return 243; - if (Imm.F == 2.0f) + if (Val == DoubleToBits(2.0)) return 244; - if (Imm.F == -2.0f) + if (Val == DoubleToBits(-2.0)) return 245; - if (Imm.F == 4.0f) + if (Val == DoubleToBits(4.0)) return 246; - if (Imm.F == -4.0f) + if (Val == DoubleToBits(-4.0)) return 247; return 255; } -void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, + unsigned OpSize) const { + if (MO.isExpr()) + return 255; + + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + if (OpSize == 4) + return getLit32Encoding(static_cast<uint32_t>(MO.getImm())); + + assert(OpSize == 8); + + return getLit64Encoding(static_cast<uint64_t>(MO.getImm())); +} + +void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -158,25 +201,24 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (!isSrcOperand(Desc, i)) continue; + int RCID = Desc.OpInfo[i].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + // Is this operand a literal immediate? const MCOperand &Op = MI.getOperand(i); - if (getLitEncoding(Op) != 255) + if (getLitEncoding(Op, RC.getSize()) != 255) continue; // Yes! 
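getLit32Encoding above assigns the SI source-operand encodings: integers 0..64 map to 128..192, -16..-1 to 193..208, the eight floating-point constants to 240..247, and 255 marks a trailing 32-bit literal; getLit64Encoding is the same table keyed on f64 bit patterns. A condensed, self-contained restatement of the 32-bit case:

#include <cstdint>
#include <cstring>

static uint32_t floatToBits(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

// 128+N for N in [0,64], 192+|N| for N in [-16,-1], 240..247 for the
// eight FP constants, else 255 ("a literal dword follows").
static uint32_t lit32Encoding(uint32_t Val) {
  int32_t S = static_cast<int32_t>(Val);
  if (S >= 0 && S <= 64)
    return 128 + static_cast<uint32_t>(S);
  if (S >= -16 && S <= -1)
    return 192 + static_cast<uint32_t>(-S);
  static const float FP[8] = {0.5f, -0.5f, 1.0f, -1.0f,
                              2.0f, -2.0f, 4.0f, -4.0f};
  for (unsigned I = 0; I != 8; ++I)
    if (Val == floatToBits(FP[I]))
      return 240 + I;
  return 255;
}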
Encode it - IntFloatUnion Imm; + int64_t Imm = 0; + if (Op.isImm()) - Imm.I = Op.getImm(); - else if (Op.isFPImm()) - Imm.F = Op.getFPImm(); - else { - assert(Op.isExpr()); - // This will be replaced with a fixup value. - Imm.I = 0; - } + Imm = Op.getImm(); + else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. + llvm_unreachable("Must be immediate or expr"); for (unsigned j = 0; j < 4; j++) { - OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); + OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); } // Only one literal value allowed @@ -192,7 +234,7 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, if (MO.isExpr()) { const MCExpr *Expr = MO.getExpr(); MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); return 0; } @@ -210,7 +252,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); MCFixupKind Kind; const MCSymbol *Sym = - Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); if (&Expr->getSymbol() == Sym) { // Add the offset to the beginning of the constant values. @@ -219,7 +261,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, // This is used for constant data stored in .rodata. Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; } - Fixups.push_back(MCFixup::Create(4, Expr, Kind, MI.getLoc())); + Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } // Figure out the operand number, needed for isSrcOperand check @@ -231,7 +273,10 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (isSrcOperand(Desc, OpNo)) { - uint32_t Enc = getLitEncoding(MO); + int RCID = Desc.OpInfo[OpNo].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + uint32_t Enc = getLitEncoding(MO, RC.getSize()); if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) return Enc; diff --git a/contrib/llvm/lib/Target/R600/Processors.td b/contrib/llvm/lib/Target/R600/Processors.td index e5fef0c..c0ffede 100644 --- a/contrib/llvm/lib/Target/R600/Processors.td +++ b/contrib/llvm/lib/Target/R600/Processors.td @@ -83,9 +83,13 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>; +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; -def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>; +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; @@ -99,15 +103,24 @@ def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; // Sea Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>; +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; -def : ProcessorModel<"kabini", SIQuarterSpeedModel, [FeatureSeaIslands]>; +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16] +>; -def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>; +def : 
ProcessorModel<"kaveri", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; -def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureSeaIslands]>; +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] +>; -def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>; +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16]>; //===----------------------------------------------------------------------===// // Volcanic Islands diff --git a/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp index f07be00..3cb9021 100644 --- a/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp +++ b/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp @@ -14,11 +14,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" diff --git a/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp index edaf278..c8f37f6 100644 --- a/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -39,14 +39,14 @@ struct CFStack { FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 }; - const AMDGPUSubtarget &ST; + const AMDGPUSubtarget *ST; std::vector<StackItem> BranchStack; std::vector<StackItem> LoopStack; unsigned MaxStackSize; unsigned CurrentEntries; unsigned CurrentSubEntries; - CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st), + CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), CurrentEntries(0), CurrentSubEntries(0) { } @@ -76,11 +76,11 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) { } bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() && + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && getLoopDepth() > 1) return true; - if (!ST.hasCFAluBug()) + if (!ST->hasCFAluBug()) return false; switch(Opcode) { @@ -91,7 +91,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { case AMDGPU::CF_ALU_CONTINUE: if (CurrentSubEntries == 0) return false; - if (ST.getWavefrontSize() == 64) { + if (ST->getWavefrontSize() == 64) { // We are being conservative here. We only require this work-around if // CurrentSubEntries > 3 && // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) @@ -102,7 +102,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { // resources without any problems. return CurrentSubEntries > 3; } else { - assert(ST.getWavefrontSize() == 32); + assert(ST->getWavefrontSize() == 32); // We are being conservative here. 
We only require the work-around if // CurrentSubEntries > 7 && // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) @@ -118,8 +118,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { default: return 0; case CFStack::FIRST_NON_WQM_PUSH: - assert(!ST.hasCaymanISA()); - if (ST.getGeneration() <= AMDGPUSubtarget::R700) { + assert(!ST->hasCaymanISA()); + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { // +1 For the push operation. // +2 Extra space required. return 3; @@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. return 2; @@ -153,13 +153,14 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { case AMDGPU::CF_PUSH_EG: case AMDGPU::CF_ALU_PUSH_BEFORE: if (!isWQM) { - if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + if (!ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && - ST.getGeneration() > AMDGPUSubtarget::EVERGREEN && - !ST.hasCaymanISA() && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; else @@ -219,7 +220,7 @@ private: const R600InstrInfo *TII; const R600RegisterInfo *TRI; unsigned MaxFetchInst; - const AMDGPUSubtarget &ST; + const AMDGPUSubtarget *ST; bool IsTrivialInst(MachineInstr *MI) const { switch (MI->getOpcode()) { @@ -233,7 +234,7 @@ private: const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; - bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); switch (CFI) { case CF_TC: Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; @@ -266,7 +267,7 @@ private: Opcode = isEg ? 
AMDGPU::POP_EG : AMDGPU::POP_R600; break; case CF_END: - if (ST.hasCaymanISA()) { + if (ST->hasCaymanISA()) { Opcode = AMDGPU::CF_END_CM; break; } @@ -467,17 +468,14 @@ private: } public: - R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), - TII (nullptr), TRI(nullptr), - ST(tm.getSubtarget<AMDGPUSubtarget>()) { - const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); - MaxFetchInst = ST.getTexVTXClauseSize(); - } + R600ControlFlowFinalizer(TargetMachine &tm) + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); - TRI = static_cast<const R600RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + ST = &MF.getSubtarget<AMDGPUSubtarget>(); + MaxFetchInst = ST->getTexVTXClauseSize(); + TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo()); + TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); CFStack CFStack(ST, MFI->getShaderType()); diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp index 2e1b094..8357b6d 100644 --- a/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp @@ -30,9 +30,9 @@ using namespace llvm; -R600TargetLowering::R600TargetLowering(TargetMachine &TM) : - AMDGPUTargetLowering(TM), - Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) { +R600TargetLowering::R600TargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); @@ -40,7 +40,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - computeRegisterProperties(); + computeRegisterProperties(STI.getRegisterInfo()); // Set condition code actions setCondCodeAction(ISD::SETO, MVT::f32, Expand); @@ -91,6 +91,14 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::v2i32, Expand); setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + // ADD, SUB overflow. + // TODO: turn these into Legal? + if (Subtarget->hasCARRY()) + setOperationAction(ISD::UADDO, MVT::i32, Custom); + + if (Subtarget->hasBORROW()) + setOperationAction(ISD::USUBO, MVT::i32, Custom); + // Expand sign extension of vectors if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -163,15 +171,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setOperationAction(ISD::SUB, MVT::i64, Expand); - - // These should be replaced by UDVIREM, but it does not happen automatically - // during Type Legalization - setOperationAction(ISD::UDIV, MVT::i64, Custom); - setOperationAction(ISD::UREM, MVT::i64, Custom); - setOperationAction(ISD::SDIV, MVT::i64, Custom); - setOperationAction(ISD::SREM, MVT::i64, Custom); - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. 
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -197,7 +196,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = *MI; const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); switch (MI->getOpcode()) { default: @@ -585,6 +584,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); + case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); + case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -611,17 +612,18 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); } case AMDGPUIntrinsic::R600_store_swizzle: { + SDLoc DL(Op); const SDValue Args[8] = { Chain, Op.getOperand(2), // Export Value Op.getOperand(3), // ArrayBase Op.getOperand(4), // Type - DAG.getConstant(0, MVT::i32), // SWZ_X - DAG.getConstant(1, MVT::i32), // SWZ_Y - DAG.getConstant(2, MVT::i32), // SWZ_Z - DAG.getConstant(3, MVT::i32) // SWZ_W + DAG.getConstant(0, DL, MVT::i32), // SWZ_X + DAG.getConstant(1, DL, MVT::i32), // SWZ_Y + DAG.getConstant(2, DL, MVT::i32), // SWZ_Z + DAG.getConstant(3, DL, MVT::i32) // SWZ_W }; - return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args); + return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); } // default for switch(IntrinsicID) @@ -652,11 +654,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); MachineSDNode *interp; if (ijb < 0) { - const MachineFunction &MF = DAG.getMachineFunction(); - const R600InstrInfo *TII = static_cast<const R600InstrInfo *>( - MF.getSubtarget().getInstrInfo()); + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); + MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); return DAG.getTargetExtractSubreg( TII->getRegisterInfo().getSubRegFromChannel(slot % 4), DL, MVT::f32, SDValue(interp, 0)); @@ -674,11 +675,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const if (slot % 4 < 2) interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), RegisterJNode, RegisterINode); else interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), RegisterJNode, RegisterINode); return SDValue(interp, slot % 2); } @@ -691,11 +692,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32), + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), RegisterJNode, RegisterINode); else interp = 
DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32), + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), RegisterJNode, RegisterINode); return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, SDValue(interp, 0), SDValue(interp, 1)); @@ -751,19 +752,19 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const } SDValue TexArgs[19] = { - DAG.getConstant(TextureOp, MVT::i32), + DAG.getConstant(TextureOp, DL, MVT::i32), Op.getOperand(1), - DAG.getConstant(0, MVT::i32), - DAG.getConstant(1, MVT::i32), - DAG.getConstant(2, MVT::i32), - DAG.getConstant(3, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - DAG.getConstant(0, MVT::i32), - DAG.getConstant(1, MVT::i32), - DAG.getConstant(2, MVT::i32), - DAG.getConstant(3, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), Op.getOperand(5), Op.getOperand(6), Op.getOperand(7), @@ -776,21 +777,21 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case AMDGPUIntrinsic::AMDGPU_dp4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(0, MVT::i32)), + DAG.getConstant(0, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(0, MVT::i32)), + DAG.getConstant(0, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(1, MVT::i32)), + DAG.getConstant(1, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(1, MVT::i32)), + DAG.getConstant(1, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(2, MVT::i32)), + DAG.getConstant(2, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(2, MVT::i32)), + DAG.getConstant(2, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(3, MVT::i32)), + DAG.getConstant(3, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(3, MVT::i32)) + DAG.getConstant(3, DL, MVT::i32)) }; return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } @@ -871,42 +872,6 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Result); return; } - case ISD::UDIV: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM); - break; - } - case ISD::UREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM.getValue(1)); - break; - } - case ISD::SDIV: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(SDIVREM); - break; - } - case ISD::SREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, 
VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(SDIVREM.getValue(1)); - break; - } case ISD::SDIVREM: { SDValue Op = SDValue(N, 1); SDValue RES = LowerSDIVREM(Op, DAG); @@ -932,8 +897,8 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, - Vector, DAG.getConstant(i, getVectorIdxTy()))); + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, + DAG.getConstant(i, DL, getVectorIdxTy()))); } return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); @@ -977,11 +942,12 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT, - DAG.getNode(ISD::FADD, SDLoc(Op), VT, - DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg, - DAG.getConstantFP(0.15915494309, MVT::f32)), - DAG.getConstantFP(0.5, MVT::f32))); + SDLoc DL(Op); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.15915494309, DL, MVT::f32)), + DAG.getConstantFP(0.5, DL, MVT::f32))); unsigned TrigNode; switch (Op.getOpcode()) { case ISD::FCOS: @@ -993,14 +959,14 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Wrong trig opcode"); } - SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT, - DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart, - DAG.getConstantFP(-0.5, MVT::f32))); + SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, FractPart, + DAG.getConstantFP(-0.5, DL, MVT::f32))); if (Gen >= AMDGPUSubtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
- return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal, - DAG.getConstantFP(3.14159265359, MVT::f32)); + return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, + DAG.getConstantFP(3.14159265359, DL, MVT::f32)); } SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { @@ -1010,11 +976,11 @@ SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, VT); - SDValue One = DAG.getConstant(1, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); - SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); @@ -1046,13 +1012,13 @@ SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, VT); - SDValue One = DAG.getConstant(1, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; - SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); @@ -1077,12 +1043,31 @@ SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); } +SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); + // Extend sign. 
+ OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, + DAG.getValueType(MVT::i1)); + + SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); +} + SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); return DAG.getNode( ISD::SETCC, - SDLoc(Op), + DL, MVT::i1, - Op, DAG.getConstantFP(0.0f, MVT::f32), + Op, DAG.getConstantFP(0.0f, DL, MVT::f32), DAG.getCondCode(ISD::SETNE) ); } @@ -1098,7 +1083,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, assert(isInt<16>(ByteOffset)); return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getConstant(ByteOffset, MVT::i32), // PTR + DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR MachinePointerInfo(ConstantPointerNull::get(PtrType)), false, false, false, 0); } @@ -1235,11 +1220,11 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const SDValue HWTrue, HWFalse; if (CompareVT == MVT::f32) { - HWTrue = DAG.getConstantFP(1.0f, CompareVT); - HWFalse = DAG.getConstantFP(0.0f, CompareVT); + HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); + HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); } else if (CompareVT == MVT::i32) { - HWTrue = DAG.getConstant(-1, CompareVT); - HWFalse = DAG.getConstant(0, CompareVT); + HWTrue = DAG.getConstant(-1, DL, CompareVT); + HWFalse = DAG.getConstant(0, DL, CompareVT); } else { llvm_unreachable("Unhandled value type in LowerSELECT_CC"); @@ -1277,8 +1262,9 @@ SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, default: llvm_unreachable("Invalid stack width"); } - return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr, - DAG.getConstant(SRLPad, MVT::i32)); + SDLoc DL(Ptr); + return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, DL, MVT::i32)); } void R600TargetLowering::getStackAddress(unsigned StackWidth, @@ -1329,26 +1315,26 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = StoreNode->getMemoryVT(); SDValue MaskConstant; if (MemVT == MVT::i8) { - MaskConstant = DAG.getConstant(0xFF, MVT::i32); + MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); } else { assert(MemVT == MVT::i16); - MaskConstant = DAG.getConstant(0xFFFF, MVT::i32); + MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); } SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, - DAG.getConstant(2, MVT::i32)); + DAG.getConstant(2, DL, MVT::i32)); SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(0x00000003, VT)); + DAG.getConstant(0x00000003, DL, VT)); SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, - DAG.getConstant(3, VT)); + DAG.getConstant(3, DL, VT)); SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 // vector instead. SDValue Src[4] = { ShiftedValue, - DAG.getConstant(0, MVT::i32), - DAG.getConstant(0, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), Mask }; SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); @@ -1361,7 +1347,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Convert pointer from byte address to dword address. 
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, MVT::i32))); + Ptr, DAG.getConstant(2, DL, MVT::i32))); if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { llvm_unreachable("Truncated and indexed stores not supported yet"); @@ -1385,8 +1371,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - getTargetMachine().getSubtargetImpl()->getFrameLowering()); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1403,13 +1389,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { unsigned Channel, PtrIncr; getStackAddress(StackWidth, i, Channel, PtrIncr); Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, MVT::i32)); + DAG.getConstant(PtrIncr, DL, MVT::i32)); SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, MVT::i32)); + Value, DAG.getConstant(i, DL, MVT::i32)); Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, MVT::i32)); + DAG.getTargetConstant(Channel, DL, MVT::i32)); } Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } else { @@ -1417,7 +1403,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); } Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, MVT::i32)); // Channel + DAG.getTargetConstant(0, DL, MVT::i32)); // Channel } return Chain; @@ -1484,16 +1470,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lower loads constant address space global variable loads if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>( - GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) { + isa<GlobalVariable>(GetUnderlyingObject( + LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, MVT::i32)); + DAG.getConstant(2, DL, MVT::i32)); return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); } if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { @@ -1520,7 +1507,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and // then div by 4 at the ISel step SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32)); + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); } EVT NewVT = MVT::v4i32; @@ -1534,15 +1521,16 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = 
DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)), - DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32) + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(4, DL, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) ); } if (!VT.isVector()) { Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); } SDValue MergedValues[2] = { @@ -1562,18 +1550,16 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { EVT MemVT = LoadNode->getMemoryVT(); assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue ShiftAmount = - DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32); SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, LoadNode->isVolatile(), LoadNode->isNonTemporal(), LoadNode->isInvariant(), LoadNode->getAlignment()); - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount); - SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); + SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, + DAG.getValueType(MemVT)); - SDValue MergedValues[2] = { Sra, Chain }; + SDValue MergedValues[2] = { Res, Chain }; return DAG.getMergeValues(MergedValues, DL); } @@ -1583,8 +1569,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - getTargetMachine().getSubtargetImpl()->getFrameLowering()); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1601,10 +1587,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const unsigned Channel, PtrIncr; getStackAddress(StackWidth, i, Channel, PtrIncr); Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, MVT::i32)); + DAG.getConstant(PtrIncr, DL, MVT::i32)); Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, Chain, Ptr, - DAG.getTargetConstant(Channel, MVT::i32), + DAG.getTargetConstant(Channel, DL, MVT::i32), Op.getOperand(2)); } for (unsigned i = NumElemVT; i < 4; ++i) { @@ -1615,7 +1601,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, - DAG.getTargetConstant(0, MVT::i32), // Channel + DAG.getTargetConstant(0, DL, MVT::i32), // Channel Op.getOperand(2)); } @@ -1704,7 +1690,7 @@ SDValue R600TargetLowering::LowerFormalArguments( MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, MVT::i32), + DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, MemVT, false, true, true, 4); @@ -1805,24 +1791,25 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, -SDValue Swz[4], SelectionDAG &DAG) const { + SDValue Swz[4], SelectionDAG &DAG, + SDLoc DL) const { assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); // Old -> 
New swizzle values DenseMap<unsigned, unsigned> SwizzleRemap; BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { - unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue(); + unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32); + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } SwizzleRemap.clear(); BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { - unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue(); + unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32); + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } return BuildVector; @@ -1868,11 +1855,12 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), + SDLoc dl(N); + return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), SelectCC.getOperand(0), // LHS SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, MVT::i32), // True - DAG.getConstant(0, MVT::i32), // Flase + DAG.getConstant(-1, dl, MVT::i32), // True + DAG.getConstant(0, dl, MVT::i32), // False SelectCC.getOperand(4)); // CC break; @@ -2015,7 +2003,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(7) // SWZ_W }; SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { @@ -2044,9 +2032,9 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(17), N->getOperand(18), }; - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG); - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(), - NewArgs); + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); } } @@ -2065,13 +2053,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, if (!Neg.getNode()) return false; Src = Src.getOperand(0); - Neg = DAG.getTargetConstant(1, MVT::i32); + Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; case AMDGPU::FABS_R600: if (!Abs.getNode()) return false; Src = Src.getOperand(0); - Abs = DAG.getTargetConstant(1, MVT::i32); + Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; case AMDGPU::CONST_COPY: { unsigned Opcode = ParentNode->getMachineOpcode(); @@ -2167,7 +2155,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, assert(C); if (C->getZExtValue()) return false; - Imm = DAG.getTargetConstant(ImmValue, MVT::i32); + Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); } Src = DAG.getRegister(ImmReg, MVT::i32); return true; @@ -2188,9 +2176,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, unsigned Opcode = Node->getMachineOpcode(); SDValue FakeOp; - std::vector<SDValue> Ops; - for (const SDUse &I : Node->ops()) - Ops.push_back(I); + std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); if (Opcode == AMDGPU::DOT_4) { int OperandIdx[] = { @@ -2252,13 +2238,11 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, 
AMDGPU::OpName::clamp); if (ClampIdx < 0) return Node; - std::vector<SDValue> Ops; - unsigned NumOp = Src.getNumOperands(); - for(unsigned i = 0; i < NumOp; ++i) - Ops.push_back(Src.getOperand(i)); - Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32); - return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node), - Node->getVTList(), Ops); + SDLoc DL(Node); + std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); + Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); + return DAG.getMachineNode(Src.getMachineOpcode(), DL, + Node->getVTList(), Ops); } else { if (!TII->hasInstrModifiers(Opcode)) return Node; diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.h b/contrib/llvm/lib/Target/R600/R600ISelLowering.h index 10ebc10..c06d3c4 100644 --- a/contrib/llvm/lib/Target/R600/R600ISelLowering.h +++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.h @@ -23,7 +23,7 @@ class R600InstrInfo; class R600TargetLowering : public AMDGPUTargetLowering { public: - R600TargetLowering(TargetMachine &TM); + R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock * BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -50,7 +50,8 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; - SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, + SDLoc DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; @@ -63,6 +64,8 @@ private: SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const; SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp b/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp index 653fd0d..5f0bdf3 100644 --- a/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp @@ -29,9 +29,7 @@ using namespace llvm; #include "AMDGPUGenDFAPacketizer.inc" R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), - RI(st) - { } + : AMDGPUInstrInfo(st), RI() {} const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { return RI; @@ -268,9 +266,8 @@ int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { return getOperandIdx(Opcode, OpTable[SrcNum]); } -#define SRC_SEL_ROWS 11 int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { - static const unsigned SrcSelTable[SRC_SEL_ROWS][2] = { + static const unsigned SrcSelTable[][2] = { {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, @@ -284,14 +281,13 @@ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} }; - for (unsigned i = 0; i < SRC_SEL_ROWS; ++i) { - if (getOperandIdx(Opcode, SrcSelTable[i][0]) == (int)SrcIdx) { - return getOperandIdx(Opcode, SrcSelTable[i][1]); + for (const auto &Row : SrcSelTable) { + if (getOperandIdx(Opcode, Row[0]) == 
(int)SrcIdx) { + return getOperandIdx(Opcode, Row[1]); } } return -1; } -#undef SRC_SEL_ROWS SmallVector<std::pair<MachineOperand *, int64_t>, 3> R600InstrInfo::getSrcs(MachineInstr *MI) const { diff --git a/contrib/llvm/lib/Target/R600/R600Instructions.td b/contrib/llvm/lib/Target/R600/R600Instructions.td index 05957d2..7beed09 100644 --- a/contrib/llvm/lib/Target/R600/R600Instructions.td +++ b/contrib/llvm/lib/Target/R600/R600Instructions.td @@ -335,10 +335,11 @@ def load_param : LoadParamFrag<load>; def load_param_exti8 : LoadParamFrag<az_extloadi8>; def load_param_exti16 : LoadParamFrag<az_extloadi16>; -def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">; +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; -def isR600toCayman : Predicate< - "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; +def isR600toCayman + : Predicate< + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; //===----------------------------------------------------------------------===// // R600 SDNodes @@ -579,6 +580,7 @@ i32imm:$COUNT, i32imm:$Enabled), let ALT_CONST = 0; let WHOLE_QUAD_MODE = 0; let BARRIER = 1; + let isCodeGenOnly = 1; let UseNamedOperandTable = 1; let Inst{31-0} = Word0; @@ -641,6 +643,7 @@ def FETCH_CLAUSE : AMDGPUInst <(outs), field bits<8> Inst; bits<8> num; let Inst = num; + let isCodeGenOnly = 1; } def ALU_CLAUSE : AMDGPUInst <(outs), @@ -648,10 +651,13 @@ def ALU_CLAUSE : AMDGPUInst <(outs), field bits<8> Inst; bits<8> num; let Inst = num; + let isCodeGenOnly = 1; } def LITERALS : AMDGPUInst <(outs), (ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + let isCodeGenOnly = 1; + field bits<64> Inst; bits<32> literal1; bits<32> literal2; @@ -677,6 +683,11 @@ def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; // TODO: Do these actually match the regular fmin/fmax behavior? def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; +// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx +// DX10 min/max returns the other operand if one is NaN, +// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic +def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; +def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; // For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, // so some of the instruction names don't match the asm string. 
@@ -770,10 +781,10 @@ def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; -def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>; -def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>; -def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>; -def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>; +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; def SETE_INT : R600_2OP < 0x3A, "SETE_INT", @@ -914,7 +925,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP < class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < inst, "MULADD_IEEE", - [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] >; class FMA_Common <bits<5> inst> : R600_3OP < @@ -1142,16 +1153,6 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) >; -// FROUND pattern -class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat < - (AMDGPUround f32:$x), - (CNDGE $x, - (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)), - (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) - ) ->; - - //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1195,8 +1196,6 @@ let Predicates = [isR600] in { def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def : RsqPat<RECIPSQRT_IEEE_r600, f32>; - def : FROUNDPat <CNDGE_r600, CNDGT_r600>; - def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; @@ -1249,6 +1248,7 @@ let Predicates = [isR600] in { def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), "PUSH_ELSE @$ADDR"> { let CNT = 0; + let POP_COUNT = 0; // FIXME? 
} def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> { diff --git a/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp index d782713..bcde5fb 100644 --- a/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp +++ b/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp @@ -16,7 +16,7 @@ #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -26,17 +26,16 @@ using namespace llvm; void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast<ScheduleDAGMILive*>(dag); + const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>(); TII = static_cast<const R600InstrInfo*>(DAG->TII); TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); - VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + VLIW5 = !ST.hasCaymanISA(); MRI = &DAG->MRI; CurInstKind = IDOther; CurEmitted = 0; OccupedSlotsMask = 31; InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); InstKindLimit[IDOther] = 32; - - const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>(); InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); AluInstCount = 0; FetchInstCount = 0; diff --git a/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 742c0e0..0c06ccc 100644 --- a/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -27,10 +27,9 @@ /// to reduce MOV count. //===----------------------------------------------------------------------===// -#include "llvm/Support/Debug.h" #include "AMDGPU.h" -#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -38,6 +37,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/R600/R600Packetizer.cpp b/contrib/llvm/lib/Target/R600/R600Packetizer.cpp index ddf68c9..deee5bc 100644 --- a/contrib/llvm/lib/Target/R600/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/R600/R600Packetizer.cpp @@ -153,7 +153,7 @@ public: TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); } // initPacketizerState - initialize some internal flags. 
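A recurring change in the R600MachineScheduler and R600Packetizer hunks above is that subtarget lookups move from the TargetMachine (DAG->TM.getSubtarget<...>(), MF.getTarget().getSubtarget<...>()) to the function being compiled (MF.getSubtarget<...>()). A minimal sketch of the resulting pattern, using stand-in types rather than the real LLVM classes:

// Stand-in types; the real code calls MachineFunction::getSubtarget<T>().
struct AMDGPUSubtargetStub {
  bool CaymanISA = false;
  bool hasCaymanISA() const { return CaymanISA; }
};

struct MachineFunctionStub {
  AMDGPUSubtargetStub ST;
  template <typename T> const T &getSubtarget() const { return ST; }
};

struct PacketizerStub {
  bool VLIW5 = false;
  void init(const MachineFunctionStub &MF) {
    // Subtarget-derived state is computed per function, not cached when
    // the pass object is constructed.
    VLIW5 = !MF.getSubtarget<AMDGPUSubtargetStub>().hasCaymanISA();
  }
};

int main() {
  MachineFunctionStub MF;
  PacketizerStub P;
  P.init(MF);
  return P.VLIW5 ? 0 : 1;
}

This lines up with the LLVM-wide move toward per-function subtargets: one TargetMachine may serve functions with different feature attributes, so caching subtarget state at pass construction time is no longer safe.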
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp index dc95675..fb0359c 100644 --- a/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp @@ -20,14 +20,16 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st) -: AMDGPURegisterInfo(st) - { RCW.RegWeight = 0; RCW.WeightLimit = 0;} +R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { + RCW.RegWeight = 0; + RCW.WeightLimit = 0; +} BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo()); + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.h b/contrib/llvm/lib/Target/R600/R600RegisterInfo.h index f1a8a41..9713e60 100644 --- a/contrib/llvm/lib/Target/R600/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.h @@ -24,7 +24,7 @@ class AMDGPUSubtarget; struct R600RegisterInfo : public AMDGPURegisterInfo { RegClassWeight RCW; - R600RegisterInfo(const AMDGPUSubtarget &st); + R600RegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp index 419ec8b..2fc7b02 100644 --- a/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp +++ b/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp @@ -162,7 +162,7 @@ class R600TextureIntrinsicsReplacer : Value *SamplerId = I.getArgOperand(2); unsigned TextureType = - dyn_cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); unsigned SrcSelect[4] = { 0, 1, 2, 3 }; unsigned CT[4] = {1, 1, 1, 1}; @@ -186,7 +186,7 @@ class R600TextureIntrinsicsReplacer : Value *SamplerId = I.getArgOperand(5); unsigned TextureType = - dyn_cast<ConstantInt>(I.getArgOperand(6))->getZExtValue(); + cast<ConstantInt>(I.getArgOperand(6))->getZExtValue(); unsigned SrcSelect[4] = { 0, 1, 2, 3 }; unsigned CT[4] = {1, 1, 1, 1}; diff --git a/contrib/llvm/lib/Target/R600/R700Instructions.td b/contrib/llvm/lib/Target/R600/R700Instructions.td index 9aad85d..613a0d7 100644 --- a/contrib/llvm/lib/Target/R600/R700Instructions.td +++ b/contrib/llvm/lib/Target/R600/R700Instructions.td @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">; +def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; let Predicates = [isR700] in { def SIN_r700 : SIN_Common<0x6E>; diff --git a/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp index b8165fb..ccfbf1b 100644 --- a/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -102,7 +102,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); @@ -312,7 +312,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { if 
(std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) Preds.push_back(*PI); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", this); + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, + LI, false); } CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); @@ -322,7 +323,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { diff --git a/contrib/llvm/lib/Target/R600/SIDefines.h b/contrib/llvm/lib/Target/R600/SIDefines.h index b540140..4727d97 100644 --- a/contrib/llvm/lib/Target/R600/SIDefines.h +++ b/contrib/llvm/lib/Target/R600/SIDefines.h @@ -36,7 +36,8 @@ enum { DS = 1 << 17, MIMG = 1 << 18, FLAT = 1 << 19, - WQM = 1 << 20 + WQM = 1 << 20, + VGPRSpill = 1 << 21 }; } diff --git a/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp new file mode 100644 index 0000000..5fe8d19 --- /dev/null +++ b/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp @@ -0,0 +1,96 @@ +//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Spilling of EXEC masks used for control flow messes up control flow +/// lowering, so mark all live intervals associated with CF instructions as +/// non-spillable. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-cf-live-intervals" + +namespace { + +class SIFixControlFlowLiveIntervals : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix CF Live Intervals"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
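Before the registration boilerplate and runOnMachineFunction below, a toy model of what the new pass does: for each control-flow pseudo instruction it finds, the live interval of the register in operand 0 (which carries the EXEC mask state) is marked non-spillable so the register allocator can never spill it. All types here are invented stand-ins, not the patch's code:

#include <cstdio>
#include <map>
#include <vector>

enum Opcode { SI_IF, SI_ELSE, SI_END_CF, OTHER };
struct Instr { Opcode Op; unsigned Reg; };   // Reg models operand 0
struct Interval { bool Spillable = true; };

int main() {
  // Register 5 models the saved EXEC mask threaded through an if/endif.
  std::vector<Instr> Block = {{SI_IF, 5}, {OTHER, 6}, {SI_END_CF, 5}};
  std::map<unsigned, Interval> LIS;
  for (const Instr &I : Block)
    if (I.Op == SI_IF || I.Op == SI_ELSE || I.Op == SI_END_CF)
      LIS[I.Reg].Spillable = false;          // cf. markNotSpillable()
  std::printf("reg 5 spillable: %d\n", (int)LIS[5].Spillable);
  return 0;
}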
+ +INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) + +char SIFixControlFlowLiveIntervals::ID = 0; + +char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; + +FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { + return new SIFixControlFlowLiveIntervals(); +} + +bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + case AMDGPU::SI_END_CF: { + unsigned Reg = MI.getOperand(0).getReg(); + LIS->getInterval(Reg).markNotSpillable(); + break; + } + default: + break; + } + } + } + + return false; +} diff --git a/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp index cd1b3ac..23502b4 100644 --- a/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp @@ -140,7 +140,7 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) : - TRI->getRegClass(Reg); + TRI->getPhysRegClass(Reg); RC = TRI->getSubRegClass(RC, SubReg); for (MachineRegisterInfo::use_instr_iterator @@ -183,15 +183,17 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, unsigned SrcReg = Copy.getOperand(1).getReg(); unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); - const TargetRegisterClass *DstRC - = TargetRegisterInfo::isVirtualRegister(DstReg) ? - MRI.getRegClass(DstReg) : - TRI->getRegClass(DstReg); + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't really + // much we can do to fix this. 
+ return false; + } + + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); const TargetRegisterClass *SrcRC; if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - DstRC == &AMDGPU::M0RegRegClass || MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) return false; diff --git a/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp index f34c375..0c54446 100644 --- a/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp +++ b/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -54,6 +54,7 @@ #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp b/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp index cb24bba..d14e37a 100644 --- a/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp @@ -17,9 +17,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "si-fold-operands" @@ -172,6 +173,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!isSafeToFold(MI.getOpcode())) continue; + unsigned OpSize = TII->getOpSize(MI, 1); MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm(); @@ -180,10 +182,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!FoldingImm && !OpToFold.isReg()) continue; - // Folding immediates with more than one use will increase program side. + // Folding immediates with more than one use will increase program size. // FIXME: This will also reduce register usage, which may be better // in some cases. A better heuristic is needed. - if (FoldingImm && !TII->isInlineConstant(OpToFold) && + if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && !MRI.hasOneUse(MI.getOperand(0).getReg())) continue; @@ -202,7 +204,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); // FIXME: Fold operands with subregs. - if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) { + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { continue; } @@ -213,7 +216,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *UseRC = TargetRegisterInfo::isVirtualRegister(UseReg) ? MRI.getRegClass(UseReg) : - TRI.getRegClass(UseReg); + TRI.getPhysRegClass(UseReg); Imm = APInt(64, OpToFold.getImm()); @@ -237,7 +240,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ? 
MRI.getRegClass(DestReg) : - TRI.getRegClass(DestReg); + TRI.getPhysRegClass(DestReg); unsigned MovOp = TII->getMovOpcode(DestRC); if (MovOp == AMDGPU::COPY) diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp index 32ae605..12d08cf 100644 --- a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp @@ -35,8 +35,9 @@ using namespace llvm; -SITargetLowering::SITargetLowering(TargetMachine &TM) : - AMDGPUTargetLowering(TM) { +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -59,7 +60,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - computeRegisterProperties(); + computeRegisterProperties(STI.getRegisterInfo()); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); @@ -75,8 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FSIN, MVT::f32, Custom); setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -156,7 +155,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); @@ -171,16 +169,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); - // We only support LOAD/STORE and vector manipulation ops for vectors - // with > 4 elements. - MVT VecTypes[] = { - MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 - }; - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); - for (MVT VT : VecTypes) { + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. 
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -205,10 +199,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); } + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -216,6 +210,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::UMAX); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); @@ -256,47 +254,83 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, return false; } -// FIXME: This really needs an address space argument. The immediate offset -// size is different for different sets of memory instruction sets. - -// The single offset DS instructions have a 16-bit unsigned byte offset. -// -// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r + -// r + i with addr64. 32-bit has more addressing mode options. Depending on the -// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i). -// -// SMRD instructions have an 8-bit, dword offset. -// bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { + Type *Ty, unsigned AS) const { // No global is ever allowed as a base. if (AM.BaseGV) return false; - // Allow a 16-bit unsigned immediate field, since this is what DS instructions - // use. - if (!isUInt<16>(AM.BaseOffs)) - return false; + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // SMRD instructions have an 8-bit, dword offset. + // + // Assume nonuniform access, since the address space isn't enough to know + // what instruction we will use, and since we don't know if this is a load + // or store and scalar stores are only available on VI. + // + // We also know if we are doing an extload, we can't do a scalar load. + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; - // Only support r+r, - switch (AM.Scale) { - case 0: // "r+i" or just "i", depending on HasBaseReg. - break; - case 1: - if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg.
+ return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r return false; - // Otherwise we have r+r or r+i. - break; - case 2: - if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. + } + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // Basic, single offset DS instructions allow a 16-bit unsigned immediate + // field. + // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have + // an 8-bit dword offset but we don't know the alignment here. + if (!isUInt<16>(AM.BaseOffs)) return false; - // Allow 2*r as r+r. - break; - default: // Don't allow n * r + + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. + return true; + + if (AM.Scale == 1 && AM.HasBaseReg) + return true; + return false; } - - return true; + case AMDGPUAS::FLAT_ADDRESS: { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); + } + default: + llvm_unreachable("unhandled address space"); + } } bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, @@ -368,11 +402,17 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); return TII->isInlineConstant(Imm); } +static EVT toIntegerVT(EVT VT) { + if (VT.isVector()) + return VT.changeVectorElementTypeToInteger(); + return MVT::getIntegerVT(VT.getSizeInBits()); +} + SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -385,33 +425,50 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, Type *Ty = VT.getTypeForEVT(*DAG.getContext()); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg), MVT::i64); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr, - DAG.getConstant(Offset, MVT::i64)); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, + unsigned Align = DL->getABITypeAlignment(Ty); + + if (VT != MemVT && VT.isFloatingPoint()) { + // Do an integer load and convert. + // FIXME: This is mostly because load legalization after type legalization + // doesn't handle FP extloads. 
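// In practice this covers f16 kernel arguments promoted to f32: the value is
// fetched as a zero-extended integer of the memory width and converted with
// FP16_TO_FP. The assert below documents that f16 -> f32 is the only
// combination expected on this path.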
+ assert(VT.getScalarType() == MVT::f32 && + MemVT.getScalarType() == MVT::f16); + + EVT IVT = toIntegerVT(VT); + EVT MemIVT = toIntegerVT(MemVT); + SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, + IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment + return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); + } + + ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, false, // isVolatile true, // isNonTemporal true, // isInvariant - DL->getABITypeAlignment(Ty)); // Alignment + Align); // Alignment } SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - - const TargetMachine &TM = getTargetMachine(); + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); @@ -581,8 +638,7 @@ SDValue SITargetLowering::LowerFormalArguments( // Fill up the missing vector elements NumElements = Arg.VT.getVectorNumElements() - NumElements; - for (unsigned j = 0; j != NumElements; ++j) - Regs.push_back(DAG.getUNDEF(VT)); + Regs.append(NumElements, DAG.getUNDEF(VT)); InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); continue; @@ -592,8 +648,8 @@ SDValue SITargetLowering::LowerFormalArguments( } if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()); + unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( + AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); } return Chain; @@ -603,25 +659,14 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::BRANCH: return BB; - case AMDGPU::V_SUB_F64: { - unsigned DestReg = MI->getOperand(0).getReg(); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) - .addImm(0) // SRC0 modifiers - .addReg(MI->getOperand(1).getReg()) - .addImm(1) // SRC1 modifiers - .addReg(MI->getOperand(2).getReg()) - .addImm(0) // CLAMP - .addImm(0); // OMOD - MI->eraseFromParent(); - break; - } + case AMDGPU::BRANCH: + return BB; case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -638,6 +683,17 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( return BB; } +bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) 
const { + // This currently forces unfolding various combinations of fsub into fma with + // free fneg'd operands. As long as we have fast FMA (controlled by + // isFMAFasterThanFMulAndFAdd), we should perform these. + + // When fma is quarter rate, and f64 add / sub are at best half rate, + // most of these combines appear to be cycle neutral but save on instruction + // count / code size. + return true; +} + EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { return MVT::i1; @@ -649,6 +705,21 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { return MVT::i32; } +// Answering this is somewhat tricky and depends on the specific device, since +// devices have different rates for fma or for all f64 operations. +// +// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other +// regardless of which device (although the number of cycles differs between +// devices), so it is always profitable for f64. +// +// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable +// only on full rate devices. Normally, we should prefer selecting v_mad_f32, +// which we can always do even without fused FP ops since it returns the same +// result as the separate operations and since it is always full +// rate. Therefore, we lie and report that it is not faster for f32. However, +// v_mad_f32 does not support denormals, so we do report fma as faster if we have +// a fast fma device and require denormals. +// bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); @@ -657,7 +728,11 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: - return false; /* There is V_MAD_F32 for f32 */ + // This is as fast on some subtargets. However, we always have full rate f32 // mad available, which returns the same result as the separate operations // and which we should prefer over fma. We can't use this if we want to support // denormals, so only report this in these cases.
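// A sketch of the f32 decision below:
//   fp32 denormals needed? | fast fmaf? | report fma as faster?
//   no                     | either     | no  (keep using v_mad_f32)
//   yes                    | no         | no  (fma would be slow)
//   yes                    | yes        | yes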
+ return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); case MVT::f64: return true; default: @@ -753,15 +828,12 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); // Build the result and - SmallVector<EVT, 4> Res; - for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) - Res.push_back(Intr->getValueType(i)); + ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); // operands of the new intrinsic call SmallVector<SDValue, 4> Ops; Ops.push_back(BRCOND.getOperand(0)); - for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) - Ops.push_back(Intr->getOperand(i)); + Ops.append(Intr->op_begin() + 1, Intr->op_end()); Ops.push_back(Target); // build the new intrinsic call @@ -821,23 +893,40 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, DL, MVT::i32)); SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), PtrLo, GA); SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, MVT::i32), + PtrHi, DAG.getConstant(0, DL, MVT::i32), SDValue(Lo.getNode(), 1)); return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); } +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + SDValue V) const { + // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, + // so we will end up with redundant moves to m0. + // + // We can't use S_MOV_B32, because there is no way to specify m0 as the + // destination register. + // + // We have to use them both. MachineCSE will combine all the S_MOV_B32 + // instructions, and the register coalescer will eliminate the extra copies. + SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); + return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), + SDValue(M0, 0), SDValue()); // Glue: a null SDValue + // creates a glue result. +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -926,7 +1015,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
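// fract(x) is lowered as x - floor(x); e.g. fract(2.75) = 2.75 - 2.0 = 0.75.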
return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); - + case AMDGPUIntrinsic::SI_fs_constant: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(1), Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_fs_interp: { + SDValue IJ = Op.getOperand(4); + SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(0, DL, MVT::i32)); + SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(1, DL, MVT::i32)); + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, + DAG.getVTList(MVT::f32, MVT::Glue), + I, Op.getOperand(1), Op.getOperand(2), Glue); + Glue = SDValue(P1.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, + Op.getOperand(1), Op.getOperand(2), Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -935,12 +1045,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_sendmsg: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } case AMDGPUIntrinsic::SI_tbuffer_store: { - SDLoc DL(Op); SDValue Ops[] = { Chain, Op.getOperand(2), @@ -1013,8 +1129,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Cond = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue One = DAG.getConstant(1, MVT::i32); + SDValue Zero = DAG.getConstant(0, DL, MVT::i32); + SDValue One = DAG.getConstant(1, DL, MVT::i32); SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); @@ -1089,12 +1205,12 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); @@ -1119,7 +1235,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); - const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); @@ -1149,7 +1265,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. 
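// Workaround sketch: the f64 operands are bitcast to v2i32 (NumBC below) so
// their high dwords can be compared directly, recomputing the scale decision
// instead of reading the unusable div_scale condition bit.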
- const SDValue Hi = DAG.getConstant(1, MVT::i32); + const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); // Figure out which scale to use for div_fmas. SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); @@ -1218,11 +1334,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT, - DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg, - DAG.getConstantFP(0.5 / M_PI, VT))); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.5/M_PI, DL, + VT))); switch (Op.getOpcode()) { case ISD::FCOS: @@ -1341,6 +1459,35 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, return SDValue(); } +/// \brief Return true if the given offset Size in bytes can be folded into +/// the immediate offsets of a memory instruction for the given address space. +static bool canFoldOffset(unsigned OffsetSize, unsigned AS, + const AMDGPUSubtarget &STI) { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions have a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI. + if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } +} + // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) // This is a variant of @@ -1372,13 +1519,10 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, if (!CAdd) return SDValue(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - // If the resulting offset is too large, we can't fold it into the addressing // mode offset.
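// Worked example: for (shl (add x, 16), 2) the folded byte offset is
// 16 << 2 = 64, which fits a 12-bit MUBUF offset (< 4096) and the combine
// fires; with a shift of 10 instead, 16 << 10 = 16384 does not fit and the
// node is left untouched.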
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace)) + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -1386,7 +1530,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, EVT VT = N->getValueType(0); SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); - SDValue COffset = DAG.getConstant(Offset, MVT::i32); + SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); } @@ -1435,8 +1579,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, "mask not equal"); - return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, - X, DAG.getConstant(Mask, MVT::i32)); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + X, DAG.getConstant(Mask, DL, MVT::i32)); } } } @@ -1466,8 +1611,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, static const uint32_t MaxMask = 0x3ff; uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; - return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, - Src, DAG.getConstant(NewMask, MVT::i32)); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + Src, DAG.getConstant(NewMask, DL, MVT::i32)); } return SDValue(); @@ -1481,7 +1627,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, // fp_class x, 0 -> false if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { if (CMask->isNullValue()) - return DAG.getConstant(0, MVT::i1); + return DAG.getConstant(0, SDLoc(N), MVT::i1); } return SDValue(); @@ -1491,15 +1637,15 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: return AMDGPUISD::FMAX3; - case AMDGPUISD::SMAX: + case ISD::SMAX: return AMDGPUISD::SMAX3; - case AMDGPUISD::UMAX: + case ISD::UMAX: return AMDGPUISD::UMAX3; case ISD::FMINNUM: return AMDGPUISD::FMIN3; - case AMDGPUISD::SMIN: + case ISD::SMIN: return AMDGPUISD::SMIN3; - case AMDGPUISD::UMIN: + case ISD::UMIN: return AMDGPUISD::UMIN3; default: llvm_unreachable("Not a min/max opcode"); @@ -1565,8 +1711,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, const APFloat &APF = CRHS->getValueAPF(); if (APF.isInfinity() && !APF.isNegative()) { unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; - return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, - LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32)); + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(Mask, SL, MVT::i32)); } } @@ -1585,10 +1731,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performSetCCCombine(N, DCI); case ISD::FMAXNUM: // TODO: What about fmax_legacy? case ISD::FMINNUM: - case AMDGPUISD::SMAX: - case AMDGPUISD::SMIN: - case AMDGPUISD::UMAX: - case AMDGPUISD::UMIN: { + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) @@ -1628,6 +1774,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (VT != MVT::f32) break; + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. 
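// Sketch of the transform guarded here: (fadd (fadd a, a), b) becomes
// (fmad 2.0, a, b). The arithmetic is identical either way; the guard exists
// only because v_mad_f32 flushes denormal results.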
+ if (Subtarget->hasFP32Denormals()) + break; + SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -1638,8 +1789,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::FADD) { SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS); + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); } } @@ -1647,12 +1798,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (RHS.getOpcode() == ISD::FADD) { SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS); + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); } } - break; + return SDValue(); } case ISD::FSUB: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -1662,39 +1813,22 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, // Try to get the fneg to fold into the source modifier. This undoes generic // DAG combines and folds them into the mad. - if (VT == MVT::f32) { + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::FMUL) { - // (fsub (fmul a, b), c) -> mad a, b, (fneg c) - - SDValue A = LHS.getOperand(0); - SDValue B = LHS.getOperand(1); - SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - - if (RHS.getOpcode() == ISD::FMUL) { - // (fsub c, (fmul a, b)) -> mad (fneg a), b, c - - SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); - SDValue B = RHS.getOperand(1); - SDValue C = LHS; - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - if (LHS.getOpcode() == ISD::FADD) { // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); } } @@ -1703,10 +1837,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS); + const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); } } + + return SDValue(); } break; @@ -1740,9 +1876,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); if (NewPtr) { - SmallVector<SDValue, 8> NewOps; - for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I) - NewOps.push_back(MemNode->getOperand(I)); + SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); @@ -1760,33 +1894,21 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Test if RegClass is one of the VSrc classes -static bool isVSrc(unsigned RegClass) { - switch(RegClass) { - default: return false; - case AMDGPU::VS_32RegClassID: - case AMDGPU::VS_64RegClassID: - return true; - } -} - /// \brief Analyze the possible immediate value Op /// /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { - if (Node->getZExtValue() >> 32) - return -1; - if (TII->isInlineConstant(Node->getAPIntValue())) return 0; - return Node->getZExtValue(); + uint64_t Val = Node->getZExtValue(); + return isUInt<32>(Val) ? Val : -1; } if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { @@ -1802,69 +1924,6 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { return -1; } -const TargetRegisterClass *SITargetLowering::getRegClassForNode( - SelectionDAG &DAG, const SDValue &Op) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - if (!Op->isMachineOpcode()) { - switch(Op->getOpcode()) { - case ISD::CopyFromReg: { - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - return MRI.getRegClass(Reg); - } - return TRI.getPhysRegClass(Reg); - } - default: return nullptr; - } - } - const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); - int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; - if (OpClassID != -1) { - return TRI.getRegClass(OpClassID); - } - switch(Op.getMachineOpcode()) { - case AMDGPU::COPY_TO_REGCLASS: - // Operand 1 is the register class id for COPY_TO_REGCLASS instructions. - OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue(); - - // If the COPY_TO_REGCLASS instruction is copying to a VSrc register - // class, then the register class for the value could be either a - // VReg or and SReg. In order to get a more accurate - if (isVSrc(OpClassID)) - return getRegClassForNode(DAG, Op.getOperand(0)); - - return TRI.getRegClass(OpClassID); - case AMDGPU::EXTRACT_SUBREG: { - int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - const TargetRegisterClass *SuperClass = - getRegClassForNode(DAG, Op.getOperand(0)); - return TRI.getSubClassWithSubReg(SuperClass, SubIdx); - } - case AMDGPU::REG_SEQUENCE: - // Operand 0 is the register class id for REG_SEQUENCE instructions. - return TRI.getRegClass( - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()); - default: - return getRegClassFor(Op.getSimpleValueType()); - } -} - -/// \brief Does "Op" fit into register class "RegClass" ? 
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, - unsigned RegClass) const { - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); - if (!RC) { - return false; - } - return TRI->getRegClass(RegClass)->hasSubClassEq(RC); -} - /// \brief Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { @@ -1921,15 +1980,15 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Adjust the writemask in the node std::vector<SDValue> Ops; - Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); - for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) - Ops.push_back(Node->getOperand(i)); + Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); + Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32); + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), + MVT::i32); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Users[Lane]->getValueType(0), SDValue(Node, 0), RC); @@ -1944,7 +2003,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, if (!User) continue; - SDValue Op = DAG.getTargetConstant(Idx, MVT::i32); + SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); DAG.UpdateNodeOperands(User, User->getOperand(0), Op); switch (Idx) { @@ -1981,9 +2040,8 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, /// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - Node = AdjustRegClass(Node, DAG); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (TII->isMIMG(Node->getMachineOpcode())) adjustWritemask(Node, DAG); @@ -2000,8 +2058,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); TII->legalizeOperands(MI); @@ -2040,26 +2098,26 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, } static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { - SDValue K = DAG.getTargetConstant(Val, MVT::i32); + SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); } MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); #if 1 // XXX - Workaround for moveToVALU not handling different register class // inserts for REG_SEQUENCE. 
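// Layout sketch of the addr64 resource built below, as four 32-bit dwords:
//   sub0-sub1: the 64-bit base pointer (Ptr)
//   sub2:      0
//   sub3:      the high half of getDefaultRsrcDataFormat()
// SubRegHi assembles the constant sub2/sub3 pair first; the second
// REG_SEQUENCE then glues it behind the pointer.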
// Build the half of the subregister with the constants. const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32), + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, MVT::i32) + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) }; SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, @@ -2067,11 +2125,11 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, // Combine the constants and the pointer. const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32) + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) }; return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); @@ -2104,7 +2162,8 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); if (RsrcDword1) { PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, - DAG.getConstant(RsrcDword1, MVT::i32)), 0); + DAG.getConstant(RsrcDword1, DL, MVT::i32)), + 0); } SDValue DataLo = buildSMovImm32(DAG, DL, @@ -2112,15 +2171,15 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), PtrLo, - DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), PtrHi, - DAG.getTargetConstant(AMDGPU::sub1, MVT::i32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), DataLo, - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), DataHi, - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) }; return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); @@ -2129,64 +2188,14 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size return buildRSRC(DAG, DL, Ptr, 0, Rsrc); } -MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, - SelectionDAG &DAG) const { - - SDLoc DL(N); - unsigned NewOpcode = N->getMachineOpcode(); - - switch (N->getMachineOpcode()) { - default: return N; - case AMDGPU::S_LOAD_DWORD_IMM: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - // Fall-through - case AMDGPU::S_LOAD_DWORDX2_SGPR: - if (NewOpcode == N->getMachineOpcode()) { - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - } - // Fall-through - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { - if (NewOpcode == N->getMachineOpcode()) { - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; 
- } - if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) { - return N; - } - ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); - - const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64); - SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0); - MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr); - - SmallVector<SDValue, 8> Ops; - Ops.push_back(SDValue(RSrc, 0)); - Ops.push_back(N->getOperand(0)); - - // The immediate offset is in dwords on SI and in bytes on VI. - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue(), MVT::i32)); - else - Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue() << 2, MVT::i32)); - - // Copy remaining operands so we keep any chain and glue nodes that follow - // the normal operands. - for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I) - Ops.push_back(N->getOperand(I)); - - return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); - } - } -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2195,3 +2204,38 @@ SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), cast<RegisterSDNode>(VReg)->getReg(), VT); } + +//===----------------------------------------------------------------------===// +// SI Inline Assembly Support +//===----------------------------------------------------------------------===// + +std::pair<unsigned, const TargetRegisterClass *> +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, + MVT VT) const { + if (Constraint == "r") { + switch(VT.SimpleTy) { + default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); + case MVT::i64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case MVT::i32: + return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + } + } + + if (Constraint.size() > 1) { + const TargetRegisterClass *RC = nullptr; + if (Constraint[1] == 'v') { + RC = &AMDGPU::VGPR_32RegClass; + } else if (Constraint[1] == 's') { + RC = &AMDGPU::SGPR_32RegClass; + } + + if (RC) { + unsigned Idx = std::atoi(Constraint.substr(2).c_str()); + if (Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.h b/contrib/llvm/lib/Target/R600/SIISelLowering.h index 876fd8c..a956b01 100644 --- a/contrib/llvm/lib/Target/R600/SIISelLowering.h +++ b/contrib/llvm/lib/Target/R600/SIISelLowering.h @@ -42,13 +42,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, - const SDValue &Op) const; - bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op, - unsigned RegClass) const; - void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; - MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -63,13 +57,13 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; public: - SITargetLowering(TargetMachine &tm); + SITargetLowering(TargetMachine &tm, const 
AMDGPUSubtarget &STI); bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, EVT /*VT*/) const override; bool isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const override; + Type *Ty, unsigned AS) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, @@ -95,6 +89,7 @@ public: MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const override; + bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; MVT getScalarShiftAmountTy(EVT VT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; @@ -118,6 +113,11 @@ public: MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; + + std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const override; + SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp index 50f20ac..90a37f1 100644 --- a/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp @@ -259,7 +259,8 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, return; } - if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // @@ -412,7 +413,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < + AMDGPUSubtarget::VOLCANIC_ISLANDS) return; // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. diff --git a/contrib/llvm/lib/Target/R600/SIInstrFormats.td b/contrib/llvm/lib/Target/R600/SIInstrFormats.td index b825208..3dddd24 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/R600/SIInstrFormats.td @@ -39,6 +39,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> MIMG = 0; field bits<1> FLAT = 0; field bits<1> WQM = 0; + field bits<1> VGPRSpill = 0; // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; @@ -66,6 +67,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{18} = MIMG; let TSFlags{19} = FLAT; let TSFlags{20} = WQM; + let TSFlags{21} = VGPRSpill; // Most instructions require adjustments after selection to satisfy // operand requirements.
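The new VGPRSpill field is wired into TSFlags{21} in the same way as the existing bits, so C++ code can test it through MCInstrDesc::TSFlags. A minimal sketch of how such a bit is typically queried once TableGen has emitted it; the SIInstrFlags enumerator and helper below are assumed names for illustration, not part of this patch:

#include "llvm/MC/MCInstrDesc.h"
#include <cstdint>

// Mirror of TSFlags{21}; must be kept in sync with SIInstrFormats.td
// (assumed enumerator, for illustration only).
namespace SIInstrFlags {
enum : uint64_t { VGPRSpill = UINT64_C(1) << 21 };
}

static bool isVGPRSpill(const llvm::MCInstrDesc &Desc) {
  // TSFlags carries the target-specific bits TableGen packed from InstSI.
  return (Desc.TSFlags & SIInstrFlags::VGPRSpill) != 0;
}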
@@ -74,17 +76,18 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : } class Enc32 { - field bits<32> Inst; int Size = 4; } class Enc64 { - field bits<64> Inst; int Size = 8; } +class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; +def VOPDstVCC : VOPDstOperand <VCCReg>; + let Uses = [EXEC] in { class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : @@ -98,7 +101,7 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { let DisableEncoding = "$dst"; let VOPC = 1; @@ -129,6 +132,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let AddedComplexity = -1000; let VOP3 = 1; + let VALU = 1; + + let AsmMatchConverter = "cvtVOP3"; + let isCodeGenOnly = 0; + int Size = 8; } @@ -139,53 +147,61 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : //===----------------------------------------------------------------------===// class SOP1e <bits<8> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; - bits<7> SDST; - bits<8> SSRC0; - - let Inst{7-0} = SSRC0; + let Inst{7-0} = ssrc0; let Inst{15-8} = op; - let Inst{22-16} = SDST; + let Inst{22-16} = sdst; let Inst{31-23} = 0x17d; //encoding; } class SOP2e <bits<7> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + bits<8> ssrc1; - bits<7> SDST; - bits<8> SSRC0; - bits<8> SSRC1; - - let Inst{7-0} = SSRC0; - let Inst{15-8} = SSRC1; - let Inst{22-16} = SDST; + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = sdst; let Inst{29-23} = op; let Inst{31-30} = 0x2; // encoding } class SOPCe <bits<7> op> : Enc32 { + bits<8> ssrc0; + bits<8> ssrc1; - bits<8> SSRC0; - bits<8> SSRC1; - - let Inst{7-0} = SSRC0; - let Inst{15-8} = SSRC1; + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; let Inst{22-16} = op; let Inst{31-23} = 0x17e; } class SOPKe <bits<5> op> : Enc32 { + bits <7> sdst; + bits <16> simm16; - bits <7> SDST; - bits <16> SIMM16; - - let Inst{15-0} = SIMM16; - let Inst{22-16} = SDST; + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; let Inst{27-23} = op; let Inst{31-28} = 0xb; //encoding } -class SOPPe <bits<7> op> : Enc32 { +class SOPK64e <bits<5> op> : Enc64 { + bits <7> sdst = 0; + bits <16> simm16; + bits <32> imm; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; + + let Inst{63-32} = imm; +} +class SOPPe <bits<7> op> : Enc32 { bits <16> simm16; let Inst{15-0} = simm16; @@ -194,15 +210,14 @@ class SOPPe <bits<7> op> : Enc32 { } class SMRDe <bits<5> op, bits<1> imm> : Enc32 { + bits<7> sdst; + bits<7> sbase; + bits<8> offset; - bits<7> SDST; - bits<7> SBASE; - bits<8> OFFSET; - - let Inst{7-0} = OFFSET; + let Inst{7-0} = offset; let Inst{8} = imm; - let Inst{14-9} = SBASE{6-1}; - let Inst{21-15} = SDST; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding } @@ -213,6 +228,7 @@ class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let isCodeGenOnly = 0; let SALU = 1; let SOP1 = 1; } @@ -223,6 +239,7 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let isCodeGenOnly = 0; let SALU = 1; let SOP2 = 1; @@ -238,6 +255,7 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> 
pattern> : let hasSideEffects = 0; let SALU = 1; let SOPC = 1; + let isCodeGenOnly = 0; let UseNamedOperandTable = 1; } @@ -260,7 +278,6 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let isCodeGenOnly = 0; let SALU = 1; let SOPP = 1; @@ -286,13 +303,12 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : //===----------------------------------------------------------------------===// class VOP1e <bits<8> op> : Enc32 { + bits<8> vdst; + bits<9> src0; - bits<8> VDST; - bits<9> SRC0; - - let Inst{8-0} = SRC0; + let Inst{8-0} = src0; let Inst{16-9} = op; - let Inst{24-17} = VDST; + let Inst{24-17} = vdst; let Inst{31-25} = 0x3f; //encoding } @@ -324,8 +340,7 @@ class VOP2_MADKe <bits<6> op> : Enc64 { } class VOP3e <bits<9> op> : Enc64 { - - bits<8> dst; + bits<8> vdst; bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -335,7 +350,7 @@ class VOP3e <bits<9> op> : Enc64 { bits<1> clamp; bits<2> omod; - let Inst{7-0} = dst; + let Inst{7-0} = vdst; let Inst{8} = src0_modifiers{1}; let Inst{9} = src1_modifiers{1}; let Inst{10} = src2_modifiers{1}; @@ -352,8 +367,7 @@ class VOP3e <bits<9> op> : Enc64 { } class VOP3be <bits<9> op> : Enc64 { - - bits<8> dst; + bits<8> vdst; bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -363,7 +377,7 @@ class VOP3be <bits<9> op> : Enc64 { bits<7> sdst; bits<2> omod; - let Inst{7-0} = dst; + let Inst{7-0} = vdst; let Inst{14-8} = sdst; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding @@ -377,33 +391,30 @@ class VOP3be <bits<9> op> : Enc64 { } class VOPCe <bits<8> op> : Enc32 { + bits<9> src0; + bits<8> vsrc1; - bits<9> SRC0; - bits<8> VSRC1; - - let Inst{8-0} = SRC0; - let Inst{16-9} = VSRC1; + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; let Inst{24-17} = op; let Inst{31-25} = 0x3e; } class VINTRPe <bits<2> op> : Enc32 { + bits<8> vdst; + bits<8> vsrc; + bits<2> attrchan; + bits<6> attr; - bits<8> VDST; - bits<8> VSRC; - bits<2> ATTRCHAN; - bits<6> ATTR; - - let Inst{7-0} = VSRC; - let Inst{9-8} = ATTRCHAN; - let Inst{15-10} = ATTR; + let Inst{7-0} = vsrc; + let Inst{9-8} = attrchan; + let Inst{15-10} = attr; let Inst{17-16} = op; - let Inst{25-18} = VDST; + let Inst{25-18} = vdst; let Inst{31-26} = 0x32; // encoding } class DSe <bits<8> op> : Enc64 { - bits<8> vdst; bits<1> gds; bits<8> addr; @@ -424,7 +435,6 @@ class DSe <bits<8> op> : Enc64 { } class MUBUFe <bits<7> op> : Enc64 { - bits<12> offset; bits<1> offen; bits<1> idxen; @@ -455,67 +465,65 @@ class MUBUFe <bits<7> op> : Enc64 { } class MTBUFe <bits<3> op> : Enc64 { + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; - bits<8> VDATA; - bits<12> OFFSET; - bits<1> OFFEN; - bits<1> IDXEN; - bits<1> GLC; - bits<1> ADDR64; - bits<4> DFMT; - bits<3> NFMT; - bits<8> VADDR; - bits<7> SRSRC; - bits<1> SLC; - bits<1> TFE; - bits<8> SOFFSET; - - let Inst{11-0} = OFFSET; - let Inst{12} = OFFEN; - let Inst{13} = IDXEN; - let Inst{14} = GLC; - let Inst{15} = ADDR64; + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; let Inst{18-16} = op; - let Inst{22-19} = DFMT; - let Inst{25-23} = NFMT; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = 
SRSRC{6-2}; - let Inst{54} = SLC; - let Inst{55} = TFE; - let Inst{63-56} = SOFFSET; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; } class MIMGe <bits<7> op> : Enc64 { - - bits<8> VDATA; - bits<4> DMASK; - bits<1> UNORM; - bits<1> GLC; - bits<1> DA; - bits<1> R128; - bits<1> TFE; - bits<1> LWE; - bits<1> SLC; - bits<8> VADDR; - bits<7> SRSRC; - bits<7> SSAMP; - - let Inst{11-8} = DMASK; - let Inst{12} = UNORM; - let Inst{13} = GLC; - let Inst{14} = DA; - let Inst{15} = R128; - let Inst{16} = TFE; - let Inst{17} = LWE; + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<1> glc; + bits<1> da; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<1> slc; + bits<8> vaddr; + bits<7> srsrc; + bits<7> ssamp; + + let Inst{11-8} = dmask; + let Inst{12} = unorm; + let Inst{13} = glc; + let Inst{14} = da; + let Inst{15} = r128; + let Inst{16} = tfe; + let Inst{17} = lwe; let Inst{24-18} = op; - let Inst{25} = SLC; + let Inst{25} = slc; let Inst{31-26} = 0x3c; - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC{6-2}; - let Inst{57-53} = SSAMP{6-2}; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{57-53} = ssamp{6-2}; } class FLATe<bits<7> op> : Enc64 { @@ -539,36 +547,40 @@ class FLATe<bits<7> op> : Enc64 { } class EXPe : Enc64 { - bits<4> EN; - bits<6> TGT; - bits<1> COMPR; - bits<1> DONE; - bits<1> VM; - bits<8> VSRC0; - bits<8> VSRC1; - bits<8> VSRC2; - bits<8> VSRC3; - - let Inst{3-0} = EN; - let Inst{9-4} = TGT; - let Inst{10} = COMPR; - let Inst{11} = DONE; - let Inst{12} = VM; + bits<4> en; + bits<6> tgt; + bits<1> compr; + bits<1> done; + bits<1> vm; + bits<8> vsrc0; + bits<8> vsrc1; + bits<8> vsrc2; + bits<8> vsrc3; + + let Inst{3-0} = en; + let Inst{9-4} = tgt; + let Inst{10} = compr; + let Inst{11} = done; + let Inst{12} = vm; let Inst{31-26} = 0x3e; - let Inst{39-32} = VSRC0; - let Inst{47-40} = VSRC1; - let Inst{55-48} = VSRC2; - let Inst{63-56} = VSRC3; + let Inst{39-32} = vsrc0; + let Inst{47-40} = vsrc1; + let Inst{55-48} = vsrc2; + let Inst{63-56} = vsrc3; } let Uses = [EXEC] in { class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : VOP1Common <outs, ins, asm, pattern>, - VOP1e<op>; + VOP1e<op> { + let isCodeGenOnly = 0; +} class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP2Common <outs, ins, asm, pattern>, VOP2e<op>; + VOP2Common <outs, ins, asm, pattern>, VOP2e<op> { + let isCodeGenOnly = 0; +} class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : VOPCCommon <ins, asm, pattern>, VOPCe <op>; @@ -594,7 +606,14 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> : let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; - let DisableEncoding = "$m0"; + let Uses = [M0]; + + // Most instruction load and store data, so set this as the default. 
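// These defaults are conservative; individual DS definitions that only read
// or only write LDS can still override them with their own
// 'let mayLoad = 0' / 'let mayStore = 0'.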
+ let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; + let AsmMatchConverter = "cvtDS"; let SchedRW = [WriteLDS]; } @@ -607,6 +626,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : let hasSideEffects = 0; let UseNamedOperandTable = 1; + let AsmMatchConverter = "cvtMubuf"; let SchedRW = [WriteVMEM]; } diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp index 5ab33b4..d647c25 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp @@ -28,8 +28,7 @@ using namespace llvm; SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), - RI(st) { } + : AMDGPUInstrInfo(st), RI() {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -75,6 +74,20 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + // TODO: The generic check fails for VALU instructions that should be + // rematerializable due to implicit reads of exec. We really want all of the + // generic logic for this except for this. + switch (MI->getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + default: + return false; + } +} + bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const { @@ -427,7 +440,9 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } -unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { +unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + int NewOpc; // Try to map original to commuted opcode @@ -566,7 +581,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); DebugLoc DL = MBB.findDebugLoc(MI); @@ -592,10 +607,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); unsigned InputPtrReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); - static const unsigned TIDIGRegs[3] = { - TIDIGXReg, TIDIGYReg, TIDIGZReg - }; - for (unsigned Reg : TIDIGRegs) { + for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } @@ -729,6 +741,26 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + unsigned Src0 = MI->getOperand(1).getReg(); + unsigned Src1 = MI->getOperand(2).getReg(); + const MachineOperand &SrcCond = MI->getOperand(3); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addOperand(SrcCond); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(RI.getSubReg(Src0, 
AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addOperand(SrcCond); + MI->eraseFromParent(); + break; + } } return true; } @@ -759,7 +791,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, (!isOperandLegal(MI, Src0Idx, &Src1) || !isOperandLegal(MI, Src1Idx, &Src0))) { return nullptr; - } + } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. @@ -801,7 +833,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, } if (MI) - MI->setDesc(get(commuteOpcode(MI->getOpcode()))); + MI->setDesc(get(commuteOpcode(*MI))); return MI; } @@ -868,6 +900,134 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { return RC != &AMDGPU::EXECRegRegClass; } +static void removeModOperands(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src0_modifiers); + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src1_modifiers); + int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2_modifiers); + + MI.RemoveOperand(Src2ModIdx); + MI.RemoveOperand(Src1ModIdx); + MI.RemoveOperand(Src0ModIdx); +} + +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned Opc = UseMI->getOpcode(); + if (Opc == AMDGPU::V_MAD_F32) { + // Don't fold if we are using source modifiers. The new VOP2 instructions + // don't have them. + if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + return false; + } + + MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + + // Multiplied part is the constant: Use v_madmk_f32 + // We should only expect these to be on src0 due to canonicalizations. + if (Src0->isReg() && Src0->getReg() == Reg) { + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + if (!Src2->isReg() || + (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + return false; + + // We need to do some weird looking operand shuffling since the madmk + // operands are out of the normal expected order with the multiplied + // constant as the last operand. + // + // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 + // src0 -> src2 K + // src1 -> src0 + // src2 -> src1 + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. 
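// Removing from the back first matters: MachineInstr operand indices are
// positional, so dropping omod and clamp (the trailing operands) keeps the
// src0/src1/src2 positions used below valid.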
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + unsigned Src1Reg = Src1->getReg(); + unsigned Src1SubReg = Src1->getSubReg(); + unsigned Src2Reg = Src2->getReg(); + unsigned Src2SubReg = Src2->getSubReg(); + Src0->setReg(Src1Reg); + Src0->setSubReg(Src1SubReg); + Src0->setIsKill(Src1->isKill()); + + Src1->setReg(Src2Reg); + Src1->setSubReg(Src2SubReg); + Src1->setIsKill(Src2->isKill()); + + Src2->ChangeToImmediate(Imm); + + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + + // Added part is the constant: Use v_madak_f32 + if (Src2->isReg() && Src2->getReg() == Reg) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + if (!Src0->isImm() && + (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + Src2->ChangeToImmediate(Imm); + + // These come before src2. + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + } + + return false; +} + bool SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, AliasAnalysis *AA) const { @@ -1001,15 +1161,25 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { (FloatToBits(-4.0f) == Val); } -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const { - if (MO.isImm()) - return isInlineConstant(APInt(32, MO.getImm(), true)); +bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, + unsigned OpSize) const { + if (MO.isImm()) { + // MachineOperand provides no way to tell the true operand size, since it + // only records a 64-bit value. We need to know the size to determine if a + // 32-bit floating point immediate bit pattern is legal for an integer + // immediate. It would be for any 32-bit integer operand, but would not be + // for a 64-bit one. 
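+ // For example, the bit pattern of 1.0f (0x3f800000) is an inline constant + // for a 32-bit operand, but for a 64-bit integer operand the same value is + // only encodable as a literal.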
+ + unsigned BitSize = 8 * OpSize; + return isInlineConstant(APInt(BitSize, MO.getImm(), true)); + } return false; } -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const { - return MO.isImm() && !isInlineConstant(MO); +bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, + unsigned OpSize) const { + return MO.isImm() && !isInlineConstant(MO, OpSize); } static bool compareMachineOp(const MachineOperand &Op0, @@ -1039,38 +1209,13 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - if (isLiteralConstant(MO)) + unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); + if (isLiteralConstant(MO, OpSize)) return RI.opCanUseLiteralConstant(OpInfo.OperandType); return RI.opCanUseInlineConstant(OpInfo.OperandType); } -bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { - // MUBUF instructions a 12-bit offset in bytes. - return isUInt<12>(OffsetSize); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - // SMRD instructions have an 8-bit offset in dwords on SI and - // a 20-bit offset in bytes on VI. - if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return isUInt<20>(OffsetSize); - else - return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // The single offset versions have a 16-bit offset in bytes. - return isUInt<16>(OffsetSize); - } - case AMDGPUAS::PRIVATE_ADDRESS: - // Indirect register addressing does not use any offsets. - default: - return 0; - } -} - bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { int Op32 = AMDGPU::getVOPe32(Opcode); if (Op32 == -1) @@ -1094,9 +1239,10 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, } bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO) const { + const MachineOperand &MO, + unsigned OpSize) const { // Literal constants use the constant bus. 
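+ // (SGPRs and M0 do too; a VALU instruction may read the constant bus at + // most once, which is what the ConstantBusCount check in verifyInstruction + // enforces.)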
- if (isLiteralConstant(MO)) + if (isLiteralConstant(MO, OpSize)) return true; if (!MO.isReg() || !MO.isUse()) @@ -1152,7 +1298,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: - if (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) { + if (MI->getOperand(i).isImm()) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -1160,7 +1306,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM32: break; case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI->getOperand(i))) { + if (isLiteralConstant(MI->getOperand(i), + RI.getRegClass(RegClass)->getSize())) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -1207,9 +1354,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, for (int OpIdx : OpIndices) { if (OpIdx == -1) break; - const MachineOperand &MO = MI->getOperand(OpIdx); - if (usesConstantBus(MRI, MO)) { + if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) ++ConstantBusCount; @@ -1277,6 +1423,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; @@ -1313,7 +1460,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, if (TargetRegisterInfo::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); - return RI.getRegClass(Reg); + return RI.getPhysRegClass(Reg); } unsigned RCID = Desc.OpInfo[OpNo].RegClass; @@ -1457,14 +1604,16 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (isVALU(InstDesc.Opcode) && usesConstantBus(MRI, *MO)) { + if (isVALU(InstDesc.Opcode) && + usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; - if (usesConstantBus(MRI, MI->getOperand(i)) && - MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) { + const MachineOperand &Op = MI->getOperand(i); + if (Op.isReg() && Op.getReg() != SGPRUsed && + usesConstantBus(MRI, Op, getOpSize(*MI, i))) { return false; } } @@ -1557,7 +1706,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // We can use one SGPR in each VOP3 instruction. continue; } - } else if (!isLiteralConstant(MO)) { + } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { // If it is not a register and not a literal constant, then it must be // an inline constant which is always legal. continue; @@ -1730,20 +1879,21 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF " - "with non-zero soffset is not implemented"); - (void)SOffset; // Create the new instruction. 
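+ // (The _ADDR64 form takes vdata, vaddr, srsrc, soffset, offset, glc, slc + // and tfe, in that order; vaddr gets a placeholder here that is patched + // once the new 64-bit address register has been built.)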
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); MachineInstr *Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) .addOperand(*VData) - .addOperand(*SRsrc) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. - .addOperand(*Offset); + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0); // tfe MI->removeFromParent(); MI = Addr64; @@ -1787,14 +1937,20 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes // on VI. + + bool IsKill = SBase->isKill(); if (OffOp) { - bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + bool isVI = + MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS; unsigned OffScale = isVI ? 1 : 4; // Handle the _IMM variant unsigned LoOffset = OffOp->getImm() * OffScale; unsigned HiOffset = LoOffset + HalfSize; Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - .addOperand(*SBase) + // Use addReg instead of addOperand + // to make sure kill flag is cleared. + .addReg(SBase->getReg(), 0, SBase->getSubReg()) .addImm(LoOffset / OffScale); if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { @@ -1803,25 +1959,28 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) .addImm(HiOffset); // The offset in register is in bytes. Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addOperand(*SBase) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) .addReg(OffsetSGPR); } else { Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addOperand(*SBase) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) .addImm(HiOffset / OffScale); } } else { // Handle the _SGPR variant MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addOperand(*SBase) + .addReg(SBase->getReg(), 0, SBase->getSubReg()) .addOperand(*SOff); unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) .addOperand(*SOff) .addImm(HalfSize); Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) - .addOperand(*SBase) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) .addReg(OffsetSGPR); } @@ -1876,7 +2035,8 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con // SMRD instructions take a dword offset on SI and a byte offset on VI, // and MUBUF instructions always take a byte offset.
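+ // For example, an SMRD immediate offset of 4 on SI addresses byte 16, so + // it must be scaled by 4 before it can be reused as a MUBUF offset.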
ImmOffset = MI->getOperand(2).getImm(); - if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= + AMDGPUSubtarget::SEA_ISLANDS) ImmOffset <<= 2; RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -1916,12 +2076,15 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con .addImm(AMDGPU::sub3); MI->setDesc(get(NewOpcode)); if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + MI->getOperand(2).setReg(SRsrc); } else { - MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + MI->getOperand(2).ChangeToRegister(SRsrc, false); } - MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe const TargetRegisterClass *NewDstRC = RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.h b/contrib/llvm/lib/Target/R600/SIInstrInfo.h index 1298030..64b5120 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.h @@ -72,6 +72,9 @@ public: return RI; } + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; @@ -114,7 +117,7 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; - unsigned commuteOpcode(unsigned Opcode) const; + unsigned commuteOpcode(const MachineInstr &MI) const; MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI = false) const override; @@ -136,6 +139,11 @@ public: bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const final; + + unsigned getMachineCSELookAheadLimit() const override { return 500; } + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } @@ -208,24 +216,25 @@ public: return get(Opcode).TSFlags & SIInstrFlags::WQM; } + bool isVGPRSpill(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; + } + bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const MachineOperand &MO) const; - bool isLiteralConstant(const MachineOperand &MO) const; + bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO) const; - /// \brief Return true if the given offset Size in bytes can be folded into - /// the immediate offsets of a memory instruction for the given address space. - bool canFoldOffset(unsigned OffsetSize, unsigned AS) const; - /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. bool hasVALU32BitEncoding(unsigned Opcode) const; /// \brief Returns true if this operand uses the constant bus. 
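+ /// The operand size matters because an immediate that is an inline + /// constant at one width may only be encodable as a bus-consuming literal + /// at another.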
bool usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO) const; + const MachineOperand &MO, + unsigned OpSize) const; /// \brief Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp. diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.td b/contrib/llvm/lib/Target/R600/SIInstrInfo.td index a749e7f..4fc2498 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.td @@ -6,6 +6,13 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +def isCI : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + +def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; class vop { field bits<9> SI3; @@ -117,9 +124,111 @@ def SIconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> >; +//===----------------------------------------------------------------------===// +// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 +// to be glued to the memory instructions. +//===----------------------------------------------------------------------===// + +def SIld_local : SDNode <"ISD::LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ + return isLocalLoad(cast<LoadSDNode>(N)); +}]>; + +def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && + cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def si_load_local_align8 : Aligned8Bytes < + (ops node:$ptr), (si_load_local node:$ptr) +>; + +def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def si_az_extload_local : AZExtLoadBase <si_ld_local>; + +multiclass SIExtLoadLocal <PatFrag ld_node> { + + def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}] + >; + + def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}] + >; +} + +defm si_sextload_local : SIExtLoadLocal <si_sextload_local>; +defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>; + +def SIst_local : SDNode <"ISD::STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def si_st_local : PatFrag < + (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ + return isLocalStore(cast<StoreSDNode>(N)); +}]>; + +def si_store_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && + !cast<StoreSDNode>(N)->isTruncatingStore(); +}]>; + +def si_store_local_align8 : Aligned8Bytes < + (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) +>; + +def si_truncstore_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->isTruncatingStore(); +}]>; + +def si_truncstore_local_i8 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def si_truncstore_local_i16 : 
PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +multiclass SIAtomicM0Glue2 <string op_name> { + + def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] + >; + + def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; +} + +defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; +defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; +defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; +defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; +defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; +defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; +defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; + +def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>; + // Transformation function, extract the lower 32bit of a 64bit immediate def LO32 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); + return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), + MVT::i32); }]>; def LO32f : SDNodeXForm<fpimm, [{ @@ -129,12 +238,13 @@ def LO32f : SDNodeXForm<fpimm, [{ // Transformation function, extract the upper 32bit of a 64bit immediate def HI32 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32); + return CurDAG->getTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); }]>; def HI32f : SDNodeXForm<fpimm, [{ APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), + MVT::f32); }]>; def IMM8bitDWORD : PatLeaf <(imm), @@ -142,39 +252,39 @@ def IMM8bitDWORD : PatLeaf <(imm), >; def as_dword_i32imm : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() >> 2, MVT::i32); + return CurDAG->getTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); }]>; def as_i1imm : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i1); + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); }]>; def as_i8imm : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i8); + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); }]>; def as_i16imm : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16); + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); }]>; def as_i32imm: SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32); + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; def as_i64imm: SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64); + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; // Copied from the AArch64 backend: def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ return CurDAG->getTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32); + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), 
MVT::i32); }]>; // Copied from the AArch64 backend: def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ return CurDAG->getTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64); + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; def IMM8bit : PatLeaf <(imm), @@ -211,12 +321,11 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ }]>; class SGPRImm <dag frag> : PatLeaf<frag, [{ - if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() < - AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { @@ -234,14 +343,88 @@ def FRAMEri32 : Operand<iPTR> { let MIOperandInfo = (ops i32:$ptr, i32imm:$index); } +def SoppBrTarget : AsmOperandClass { + let Name = "SoppBrTarget"; + let ParserMethod = "parseSOppBrTarget"; +} + def sopp_brtarget : Operand<OtherVT> { let EncoderMethod = "getSOPPBrEncoding"; let OperandType = "OPERAND_PCREL"; + let ParserMatchClass = SoppBrTarget; } include "SIInstrFormats.td" include "VIInstrFormats.td" +def MubufOffsetMatchClass : AsmOperandClass { + let Name = "MubufOffset"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +class DSOffsetBaseMatchClass <string parser> : AsmOperandClass { + let Name = "DSOffset"#parser; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset"; +} + +def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; +def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; + +def DSOffset01MatchClass : AsmOperandClass { + let Name = "DSOffset1"; + let ParserMethod = "parseDSOff01OptionalOps"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset01"; +} + +class GDSBaseMatchClass <string parser> : AsmOperandClass { + let Name = "GDS"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; +def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; + +def GLCMatchClass : AsmOperandClass { + let Name = "GLC"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def SLCMatchClass : AsmOperandClass { + let Name = "SLC"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def TFEMatchClass : AsmOperandClass { + let Name = "TFE"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def OModMatchClass : AsmOperandClass { + let Name = "OMod"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def ClampMatchClass : AsmOperandClass { + let Name = "Clamp"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def offen : Operand<i1> { @@ -255,36 +438,58 @@ def addr64 : Operand<i1> { } def mbuf_offset : Operand<i16> { let PrintMethod = "printMBUFOffset"; + let 
ParserMatchClass = MubufOffsetMatchClass; } -def ds_offset : Operand<i16> { +class ds_offset_base <AsmOperandClass mc> : Operand<i16> { let PrintMethod = "printDSOffset"; + let ParserMatchClass = mc; } +def ds_offset : ds_offset_base <DSOffsetMatchClass>; +def ds_offset_gds : ds_offset_base <DSOffsetGDSMatchClass>; + def ds_offset0 : Operand<i8> { let PrintMethod = "printDSOffset0"; + let ParserMatchClass = DSOffset01MatchClass; } def ds_offset1 : Operand<i8> { let PrintMethod = "printDSOffset1"; + let ParserMatchClass = DSOffset01MatchClass; } +class gds_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printGDS"; + let ParserMatchClass = mc; +} +def gds : gds_base <GDSMatchClass>; + +def gds01 : gds_base <GDS01MatchClass>; + def glc : Operand <i1> { let PrintMethod = "printGLC"; + let ParserMatchClass = GLCMatchClass; } def slc : Operand <i1> { let PrintMethod = "printSLC"; + let ParserMatchClass = SLCMatchClass; } def tfe : Operand <i1> { let PrintMethod = "printTFE"; + let ParserMatchClass = TFEMatchClass; } def omod : Operand <i32> { let PrintMethod = "printOModSI"; + let ParserMatchClass = OModMatchClass; } def ClampMod : Operand <i1> { let PrintMethod = "printClampSI"; + let ParserMatchClass = ClampMatchClass; } } // End OperandType = "OPERAND_IMMEDIATE" +def VOPDstS64 : VOPDstOperand <SReg_64>; + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -293,8 +498,8 @@ def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">; -def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; @@ -316,6 +521,7 @@ def SIOperand { def SRCMODS { int NONE = 0; + int NEG = 1; } def DSTCLAMP { @@ -364,7 +570,7 @@ class EXPCommon : InstSI< multiclass EXP_m { - let isPseudo = 1 in { + let isPseudo = 1, isCodeGenOnly = 1 in { def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; } @@ -381,83 +587,109 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SOP1 <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : SOP1 <outs, ins, asm, []>, SOP1e <op.SI>, - SIMCInstr<opName, SISubtarget.SI>; + SIMCInstr<opName, SISubtarget.SI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; +} class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : SOP1 <outs, ins, asm, []>, SOP1e <op.VI>, - SIMCInstr<opName, SISubtarget.VI>; + SIMCInstr<opName, SISubtarget.VI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; +} -multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> { - def "" : SOP1_Pseudo <opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - pattern>; +multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { - def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins 
SSrc_32:$src0), - opName#" $dst, $src0">; + def "" : SOP1_Pseudo <opName, outs, ins, pattern>; - def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0">; -} + def _si : SOP1_Real_si <op, opName, outs, ins, asm>; -multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> { - def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - pattern>; + def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>; - def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0">; - - def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0">; } +multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; + +multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + // no input, 64-bit output. multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), opName#" $dst"> { - let SSRC0 = 0; + let ssrc0 = 0; } def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), opName#" $dst"> { - let SSRC0 = 0; + let ssrc0 = 0; } } -// 64-bit input, 32-bit output. -multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> { - def "" : SOP1_Pseudo <opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - pattern>; +// 64-bit input, no output +multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>; - def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0">; + def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } - def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0">; + def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } } +// 64-bit input, 32-bit output. +multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : SOP2<outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; let Size = 4; + + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. 
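+ // (A `let` can only override a field that exists on every def it wraps, + // so the pseudo carries the field even though it never encodes it.)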
+ field bits<7> sdst = 0; } class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : SOP2<outs, ins, asm, []>, SOP2e<op.SI>, - SIMCInstr<opName, SISubtarget.SI>; + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : SOP2<outs, ins, asm, []>, SOP2e<op.VI>, - SIMCInstr<opName, SISubtarget.VI>; + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), @@ -472,44 +704,36 @@ multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { opName#" $dst, $src0, $src1 [$scc]">; } -multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1), pattern>; +multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { - def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; - - def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; -} + def "" : SOP2_Pseudo <opName, outs, ins, pattern>; -multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_64:$src1), pattern>; + def _si : SOP2_Real_si <op, opName, outs, ins, asm>; - def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1">; + def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>; - def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1">; } -multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_32:$src1), pattern>; - - def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; +multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; - def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; -} +multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; +multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $dst, $src0, $src1", []>; + opName#" $src0, $src1", []>; class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -521,17 +745,34 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SOPK <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : SOPK <outs, ins, asm, []>, SOPKe <op.SI>, - SIMCInstr<opName, 
SISubtarget.SI>; + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; +} class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : SOPK <outs, ins, asm, []>, SOPKe <op.VI>, - SIMCInstr<opName, SISubtarget.VI>; + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; +} + +multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm, + string asm = opName#opAsm> { + def "" : SOPK_Pseudo <opName, outs, ins, []>; + + def _si : SOPK_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOPK_Real_vi <op, opName, outs, ins, asm>; + +} multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0), @@ -548,13 +789,39 @@ multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), (ins SReg_32:$src0, u16imm:$src1), pattern>; - def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">; + let DisableEncoding = "$dst" in { + def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; - def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">; + def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + } } +multiclass SOPK_32TIE <sopk op, string opName, list<dag> pattern> : SOPK_m < + op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), + " $sdst, $simm16" +>; + +multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins, + string argAsm, string asm = opName#argAsm> { + + def "" : SOPK_Pseudo <opName, outs, ins, []>; + + def _si : SOPK <outs, ins, asm, []>, + SOPK64e <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; + } + + def _vi : SOPK <outs, ins, asm, []>, + SOPK64e <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; + } +} //===----------------------------------------------------------------------===// // SMRD classes //===----------------------------------------------------------------------===// @@ -563,19 +830,24 @@ class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SMRD <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, string asm> : SMRD <outs, ins, asm, []>, SMRDe <op, imm>, - SIMCInstr<opName, SISubtarget.SI>; + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, string asm> : SMRD <outs, ins, asm, []>, SMEMe_vi <op, imm>, - SIMCInstr<opName, SISubtarget.VI>; + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { @@ -584,7 +856,11 @@ multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + // glc is only applicable to scalar stores, which are not yet + // implemented. 
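+ // Tying the bit to 0 keeps the VI encoding fully specified in the + // meantime.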
+ let glc = 0 in { + def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + } } multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, @@ -610,8 +886,14 @@ multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { let PrintMethod = "printOperandAndMods"; } + +def InputModsMatchClass : AsmOperandClass { + let Name = "RegWithInputMods"; +} + def InputModsNoDefault : Operand <i32> { let PrintMethod = "printOperandAndMods"; + let ParserMatchClass = InputModsMatchClass; } class getNumSrcArgs<ValueType Src1, ValueType Src2> { @@ -624,9 +906,9 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { // Returns the register class to use for the destination of VOP[123C] // instructions for the given VT. class getVALUDstForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, - !if(!eq(VT.Size, 64), VReg_64, - SReg_64)); // else VT == i1 + RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + VOPDstOperand<SReg_64>)); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] @@ -641,31 +923,12 @@ class getVOPSrc1ForVT<ValueType VT> { RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); } -// Returns the register classes for the source arguments of a VOP[12C] -// instruction for the given SrcVTs. -class getInRC32 <list<ValueType> SrcVT> { - list<DAGOperand> ret = [ - getVOPSrc0ForVT<SrcVT[0]>.ret, - getVOPSrc1ForVT<SrcVT[1]>.ret - ]; -} - // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); } -// Returns the register classes for the source arguments of a VOP3 -// instruction for the given SrcVTs. -class getInRC64 <list<ValueType> SrcVT> { - list<DAGOperand> ret = [ - getVOP3SrcForVT<SrcVT[0]>.ret, - getVOP3SrcForVT<SrcVT[1]>.ret, - getVOP3SrcForVT<SrcVT[2]>.ret - ]; -} - // Returns 1 if the source arguments have modifiers, 0 if they do not. 
class hasModifiers<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, @@ -723,7 +986,7 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, class getAsm32 <int NumSrcArgs> { string src1 = ", $src1"; string src2 = ", $src2"; - string ret = " $dst, $src0"# + string ret = "$dst, $src0"# !if(!eq(NumSrcArgs, 1), "", src1)# !if(!eq(NumSrcArgs, 3), src2, ""); } @@ -739,7 +1002,7 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string ret = !if(!eq(HasModifiers, 0), getAsm32<NumSrcArgs>.ret, - " $dst, "#src0#src1#src2#"$clamp"#"$omod"); + "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } @@ -751,7 +1014,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src0VT = ArgVT[1]; field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; - field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -767,10 +1030,20 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = "_e32"#getAsm32<NumSrcArgs>.ret; + field string Asm32 = getAsm32<NumSrcArgs>.ret; field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; } +// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order +// for the instruction patterns to work. +def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; + +def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; + def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; @@ -794,22 +1067,27 @@ def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = " $dst, $src0_modifiers, $src1"; + let Asm64 = "$dst, $src0_modifiers, $src1"; } def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = " $dst, $src0_modifiers, $src1"; + let Asm64 = "$dst, $src0_modifiers, $src1"; } def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); + let Asm64 = "$dst, $src0, $src1, $src2"; +} def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = " $dst, $src0, $vsrc1, $src2"; + field string Asm = "$dst, $src0, $vsrc1, $src2"; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; @@ -833,34 +1111,62 @@ class AtomicNoRet <string noRetOp, bit 
isRet> { class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP1Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr <opName#"_e32", SISubtarget.NONE> { + SIMCInstr <opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { let isPseudo = 1; + let isCodeGenOnly = 1; + + field bits<8> vdst; + field bits<9> src0; +} + +class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.VI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; } multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, string opName> { def "" : VOP1_Pseudo <outs, ins, pattern, opName>; - def _si : VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; - def _vi : VOP1<op.VI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI>; + def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + + def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; } multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, string opName> { def "" : VOP1_Pseudo <outs, ins, pattern, opName>; - def _si : VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; - // No VI instruction. This class is for SI only. + def _si : VOP1_Real_si <opName, op, outs, ins, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP2Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE> { + SIMCInstr<opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.VI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; } multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, @@ -868,8 +1174,7 @@ multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, def "" : VOP2_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; + def _si : VOP2_Real_si <opName, op, outs, ins, asm>; } multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, @@ -877,49 +1182,70 @@ multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, def "" : VOP2_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; - def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI>; + def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + + def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + } class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { bits<2> src0_modifiers = !if(HasModifiers, ?, 0); bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); - bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? 
,0) ,0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); bits<2> omod = !if(HasModifiers, ?, 0); bits<1> clamp = !if(HasModifiers, ?, 0); bits<9> src1 = !if(HasSrc1, ?, 0); bits<9> src2 = !if(HasSrc2, ?, 0); } +class VOP3DisableModFields <bit HasSrc0Mods, + bit HasSrc1Mods = 0, + bit HasSrc2Mods = 0, + bit HasOutputMods = 0> { + bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); + bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); + bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); + bits<2> omod = !if(HasOutputMods, ?, 0); + bits<1> clamp = !if(HasOutputMods, ?, 0); +} + class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP3Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e64", SISubtarget.NONE> { + SIMCInstr<opName#"_e64", SISubtarget.NONE>, + MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; + let isCodeGenOnly = 1; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : VOP3Common <outs, ins, asm, []>, VOP3e <op>, - SIMCInstr<opName#"_e64", SISubtarget.SI>; + SIMCInstr<opName#"_e64", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : VOP3Common <outs, ins, asm, []>, VOP3e_vi <op>, - SIMCInstr <opName#"_e64", SISubtarget.VI>; + SIMCInstr <opName#"_e64", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : VOP3Common <outs, ins, asm, []>, VOP3be <op>, - SIMCInstr<opName#"_e64", SISubtarget.SI>; + SIMCInstr<opName#"_e64", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : VOP3Common <outs, ins, asm, []>, VOP3be_vi <op>, - SIMCInstr <opName#"_e64", SISubtarget.VI>; + SIMCInstr <opName#"_e64", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, int NumSrcArgs, bit HasMods = 1> { @@ -937,14 +1263,16 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, } // VOP3_m without source modifiers -multiclass VOP3_m_nosrcmod <vop op, dag outs, dag ins, string asm, list<dag> pattern, +multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, int NumSrcArgs, bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; let src0_modifiers = 0, src1_modifiers = 0, - src2_modifiers = 0 in { + src2_modifiers = 0, + clamp = 0, + omod = 0 in { def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; } @@ -1034,9 +1362,10 @@ multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec> { + bit HasMods, bit defExec, string revOp> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { @@ -1052,18 +1381,22 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, // An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. 
multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pattern = []> { - let isPseudo = 1 in { + let isPseudo = 1, isCodeGenOnly = 1 in { def "" : VOPAnyCommon <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE>; } def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, - SIMCInstr <opName, SISubtarget.SI>; + SIMCInstr <opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + } def _vi : VOP3Common <outs, ins, asm, []>, VOP3e_vi <op.VI3>, VOP3DisableFields <1, 0, 0>, - SIMCInstr <opName, SISubtarget.VI>; + SIMCInstr <opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + } } multiclass VOP1_Helper <vop1 op, string opName, dag outs, @@ -1073,7 +1406,7 @@ multiclass VOP1_Helper <vop1 op, string opName, dag outs, defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, @@ -1108,7 +1441,7 @@ multiclass VOP2_Helper <vop2 op, string opName, dag outs, defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3_2_m <op, - outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods + outs, ins64, opName#asm64, pat64, opName, revOp, HasMods >; } @@ -1132,7 +1465,7 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, string revOp = opName> { defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; - defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#"_e64"#P.Asm64, + defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1150,7 +1483,7 @@ multiclass VOP2b_Helper <vop2 op, string opName, dag outs, defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3b_2_m <op, - outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods + outs, ins64, opName#asm64, pat64, opName, revOp, HasMods >; } @@ -1176,7 +1509,7 @@ multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, string revOp, bit HasMods> { defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; - defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, + defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, revOp, HasMods>; } @@ -1204,53 +1537,75 @@ let isCodeGenOnly = 0 in { def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, !strconcat(opName, VOP_MADK.Asm), []>, SIMCInstr <opName#"_e32", SISubtarget.SI>, - VOP2_MADKe <op.SI>; + VOP2_MADKe <op.SI> { + let AssemblerPredicates = [isSICI]; + } def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, !strconcat(opName, VOP_MADK.Asm), []>, SIMCInstr <opName#"_e32", SISubtarget.VI>, - VOP2_MADKe <op.VI>; + VOP2_MADKe <op.VI> { + let AssemblerPredicates = [isVI]; + } } // End isCodeGenOnly = 0 } class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE> { + SIMCInstr<opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { let isPseudo = 1; + let isCodeGenOnly = 1; } multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, bit DefExec> { + string opName, bit DefExec, string revOpName = ""> { def "" : VOPC_Pseudo <outs, ins, pattern, opName>; def _si : VOPC<op.SI, ins, asm, []>, SIMCInstr <opName#"_e32", 
SISubtarget.SI> { let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; } def _vi : VOPC<op.VI, ins, asm, []>, SIMCInstr <opName#"_e32", SISubtarget.VI> { let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; } } multiclass VOPC_Helper <vopc op, string opName, dag ins32, string asm32, list<dag> pat32, dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec> { + bit HasMods, bit DefExec, string revOp> { defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, - opName, HasMods, DefExec>; + defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, + opName, HasMods, DefExec, revOp>; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. +multiclass VOPC_Class_Helper <vopc op, string opName, + dag ins32, string asm32, list<dag> pat32, + dag out64, dag ins64, string asm64, list<dag> pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; + + defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, + opName, HasMods, DefExec, revOp>, + VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, + string revOp = opName, bit DefExec = 0> : VOPC_Helper < op, opName, P.Ins32, P.Asm32, [], - (outs SReg_64:$dst), P.Ins64, P.Asm64, + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, !if(P.HasModifiers, [(set i1:$dst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1258,54 +1613,55 @@ multiclass VOPCInst <vopc op, string opName, (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec + P.HasModifiers, DefExec, revOp >; multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0> : VOPC_Helper < + bit DefExec = 0> : VOPC_Class_Helper < op, opName, P.Ins32, P.Asm32, [], - (outs SReg_64:$dst), P.Ins64, P.Asm64, + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, !if(P.HasModifiers, [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec + P.HasModifiers, DefExec, opName >; -multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCInst <op, opName, VOP_F32_F32_F32, cond>; +multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; -multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCInst <op, opName, VOP_F64_F64_F64, cond>; +multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; -multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCInst <op, opName, VOP_I32_I32_I32, cond>; +multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; -multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCInst <op, opName, VOP_I64_I64_I64, cond>; +multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; multiclass VOPCX <vopc op, string opName, VOPProfile 
P, - PatLeaf cond = COND_NULL> - : VOPCInst <op, opName, P, cond, 1>; + PatLeaf cond = COND_NULL, + string revOp = ""> + : VOPCInst <op, opName, P, cond, revOp, 1>; -multiclass VOPCX_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCX <op, opName, VOP_F32_F32_F32, cond>; +multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; -multiclass VOPCX_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCX <op, opName, VOP_F64_F64_F64, cond>; +multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; -multiclass VOPCX_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCX <op, opName, VOP_I32_I32_I32, cond>; +multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; -multiclass VOPCX_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> : - VOPCX <op, opName, VOP_I64_I64_I64, cond>; +multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < - op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods >; multiclass VOPC_CLASS_F32 <vopc op, string opName> : @@ -1322,7 +1678,7 @@ multiclass VOPCX_CLASS_F64 <vopc op, string opName> : multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < - op, opName, P.Outs, P.Ins64, P.Asm64, + op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, !if(!eq(P.NumSrcArgs, 3), !if(P.HasModifiers, [(set P.DstVT:$dst, @@ -1354,7 +1710,7 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < op, opName, - P.Outs, + (outs P.DstRC.RegClass:$dst), (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, @@ -1407,6 +1763,7 @@ class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, @@ -1421,17 +1778,13 @@ class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, VINTRPe_vi <op>, SIMCInstr<opName, SISubtarget.VI>; -multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm, - string disableEncoding = "", string constraints = "", +multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern = []> { - let DisableEncoding = disableEncoding, - Constraints = constraints in { - def "" : VINTRP_Pseudo <opName, outs, ins, pattern>; + def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>; - def _si : VINTRP_Real_si <op, opName, outs, ins, asm>; + def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>; - def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm>; - } + def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>; } //===----------------------------------------------------------------------===// @@ -1442,33 +1795,33 @@ class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : DS <outs, ins, "", pattern>, SIMCInstr <opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 
1; } class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : DS <outs, ins, asm, []>, DSe <op>, - SIMCInstr <opName, SISubtarget.SI>; + SIMCInstr <opName, SISubtarget.SI> { + let isCodeGenOnly = 0; +} class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : DS <outs, ins, asm, []>, DSe_vi <op>, SIMCInstr <opName, SISubtarget.VI>; -class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : - DS <outs, ins, asm, []>, - DSe <op>, - SIMCInstr <opName, SISubtarget.SI> { +class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS_Real_si <op, opName, outs, ins, asm> { // Single loads interpret the 2 i8imm operands as a single i16 offset. bits<16> offset; let offset0 = offset{7-0}; let offset1 = offset{15-8}; + let isCodeGenOnly = 0; } -class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : - DS <outs, ins, asm, []>, - DSe_vi <op>, - SIMCInstr <opName, SISubtarget.VI> { +class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS_Real_vi <op, opName, outs, ins, asm> { // Single loads interpret the 2 i8imm operands as a single i16 offset. bits<16> offset; @@ -1476,178 +1829,166 @@ class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : let offset1 = offset{15-8}; } -multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm, - list<dag> pat> { - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def "" : DS_Pseudo <opName, outs, ins, pat>; +multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr"#"$offset$gds"> { - let data0 = 0, data1 = 0 in { - def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; - } + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; } } -multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> - : DS_1A_Load_m < - op, - asm, - (outs regClass:$vdst), - (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0), - asm#" $vdst, $addr"#"$offset"#" [M0]", - []>; - -multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm, - list<dag> pat> { - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def "" : DS_Pseudo <opName, outs, ins, pat>; - - let data0 = 0, data1 = 0 in { - def _si : DS_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Real_vi <op, opName, outs, ins, asm>; - } +multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + gds01:$gds), + string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; } } -multiclass DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> - : DS_Load2_m < - op, - asm, - (outs regClass:$vdst), - (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, - M0Reg:$m0), - asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]", - []>; - -multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag
ins, - string asm, list<dag> pat> { - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { - def "" : DS_Pseudo <opName, outs, ins, pat>; - - let data1 = 0, vdst = 0 in { - def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; - } +multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<opName, 0>; + + let data1 = 0, vdst = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; } } -multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> - : DS_1A_Store_m < - op, - asm, - (outs), - (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0), - asm#" $addr, $data0"#"$offset"#" [M0]", - []>; - -multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pat> { - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { - def "" : DS_Pseudo <opName, outs, ins, pat>; - - let vdst = 0 in { - def _si : DS_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Real_vi <op, opName, outs, ins, asm>; - } +multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; } } -multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> - : DS_Store_m < - op, - asm, - (outs), - (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1, - ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0), - asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]", - []>; - -// 1 address, 1 data. -multiclass DS_1A1D_RET_m <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pat, string noRetOp> { - let mayLoad = 1, mayStore = 1, - hasPostISelHook = 1 // Adjusted to no return version. - in { - def "" : DS_Pseudo <opName, outs, ins, pat>, - AtomicNoRet<noRetOp, 1>; - - let data1 = 0 in { - def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; - } +multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, + string noRetOp = "", + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; + + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; } } -multiclass DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, - string noRetOp = ""> : DS_1A1D_RET_m < - op, asm, - (outs rc:$vdst), - (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), - asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", [], noRetOp>; - -// 1 address, 2 data. 
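// Usage sketch (assuming the defaults above; DS_ADD_U32 is instantiated
// later in this patch with exactly these arguments):
//
//   defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
//
// With the defaulted outs/ins/asm parameters, this single defm expands to
// the selection pseudo DS_ADD_U32 plus the encoded DS_ADD_U32_si and
// DS_ADD_U32_vi records, the asm string being derived from opName
// ("ds_add_u32 $addr, $data0$offset$gds"), where the old scheme needed a
// separate *_m multiclass plus a wrapper spelling out the operand lists.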
-multiclass DS_1A2D_RET_m <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pat, string noRetOp> { - let mayLoad = 1, mayStore = 1, - hasPostISelHook = 1 // Adjusted to no return version. - in { - def "" : DS_Pseudo <opName, outs, ins, pat>, - AtomicNoRet<noRetOp, 1>; - - def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; - } +multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, + string noRetOp = "", dag ins, + dag outs = (outs rc:$vdst), + string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; + + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; } multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, - string noRetOp = ""> : DS_1A2D_RET_m < - op, asm, - (outs rc:$vdst), - (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), - asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]", - [], noRetOp>; - -// 1 address, 2 data. -multiclass DS_1A2D_NORET_m <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pat, string noRetOp> { - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo <opName, outs, ins, pat>, - AtomicNoRet<noRetOp, 0>; + string noRetOp = "", RegisterClass src = rc> : + DS_1A2D_RET_m <op, asm, rc, noRetOp, + (ins VGPR_32:$addr, src:$data0, src:$data1, + ds_offset:$offset, gds:$gds) +>; + +multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc, + string noRetOp = opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { - def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 0>; + + let vdst = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; } } -multiclass DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, - string noRetOp = asm> : DS_1A2D_NORET_m < - op, asm, - (outs), - (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), - asm#" $addr, $data0, $data1"#"$offset"#" [M0]", - [], noRetOp>; +multiclass DS_0A_RET <bits<8> op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst"#"$offset"#"$gds"> { -// 1 address, 1 data. 
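// Usage sketch: the trailing RegisterClass parameter of DS_1A2D_RET lets the
// returned register class differ from the data class. Both instantiations
// below appear later in this patch -- the 2x32-bit exchange returns a 64-bit
// pair from 32-bit data operands, the 2x64-bit form a 128-bit quad from
// 64-bit operands:
//
//   defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET <
//     0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32
//   >;
//   defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <
//     0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64
//   >;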
-multiclass DS_1A1D_NORET_m <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pat, string noRetOp> { let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo <opName, outs, ins, pat>, - AtomicNoRet<noRetOp, 0>; + def "" : DS_Pseudo <opName, outs, ins, []>; - let data1 = 0 in { - def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; - } - } + let addr = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } // end addr = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 } -multiclass DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, - string noRetOp = asm> : DS_1A1D_NORET_m < - op, asm, - (outs), - (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), - asm#" $addr, $data0"#"$offset"#" [M0]", - [], noRetOp>; +multiclass DS_1A_RET_GDS <bits<8> op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), + string asm = opName#" $vdst, $addr"#"$offset gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0, gds = 1 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } // end data0 = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A_GDS <bits<8> op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr), + string asm = opName#" $addr gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } // end vdst = 0, data = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A <bits<8> op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>; + + let vdst = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } // let vdst = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} //===----------------------------------------------------------------------===// // MTBUF classes @@ -1657,6 +1998,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : MTBUF <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, @@ -1718,6 +2060,20 @@ class mubuf <bits<7> si, bits<7> vi = si> { field bits<7> VI = vi; } +let isCodeGenOnly = 0 in { + +class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { + let lds = 0; +} + +} // End let isCodeGenOnly = 0 + +class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> { + let lds = 0; +} + class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { bit IsAddr64 = is_addr64; string OpName = NAME # suffix; @@ -1727,6 +2083,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : MUBUF <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; // dummy fields, so that we can use let statements around 
multiclasses bits<1> offen; @@ -1760,7 +2117,7 @@ multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, MUBUFAddr64Table <0>; - let addr64 = 0 in { + let addr64 = 0, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; } @@ -1773,7 +2130,7 @@ multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, MUBUFAddr64Table <1>; - let addr64 = 1 in { + let addr64 = 1, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; } @@ -1781,11 +2138,6 @@ multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, // for VI appropriately. } -class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { - let lds = 0; -} - multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins, string asm, list<dag> pattern, bit is_return> { @@ -1809,7 +2161,7 @@ multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, AtomicNoRet<NAME#"_ADDR64", is_return>; - let offen = 0, idxen = 0, addr64 = 1, tfe = 0, soffset = 128 in { + let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; } @@ -1828,14 +2180,14 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", [], 0 + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 >; defm _OFFSET : MUBUFAtomicOffset_m < op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, - SCSrc_32:$soffset, slc:$slc), + (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, + slc:$slc), name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 >; } // glc = 0 @@ -1847,17 +2199,17 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc", + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset, - i1:$slc), vt:$vdata_in))], 1 + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$vdata_in))], 1 >; defm _RTN_OFFSET : MUBUFAtomicOffset_m < op, name#"_rtn_offset", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset, - SCSrc_32:$soffset, slc:$slc), + (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, @@ -1876,9 +2228,8 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, let mayLoad = 1, mayStore = 0 in { let offen = 0, idxen = 0, vaddr = 0 in { defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata), - (ins SReg_128:$srsrc, - mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, - slc:$slc, tfe:$tfe), + (ins SReg_128:$srsrc, 
SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, @@ -1887,7 +2238,7 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, let offen = 1, idxen = 0 in { defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata), - (ins SReg_128:$srsrc, VGPR_32:$vaddr, + (ins VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; @@ -1895,43 +2246,48 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, let offen = 0, idxen = 1 in { defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata), - (ins SReg_128:$srsrc, VGPR_32:$vaddr, - mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + (ins VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 1, idxen = 1 in { defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; + (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } - let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { + let offen = 0, idxen = 0 in { defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", + (ins VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, + glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"# + "$glc"#"$slc"#"$tfe", [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, - i64:$vaddr, i16:$offset)))]>; + i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, + i1:$tfe)))]>; } } } multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, - ValueType store_vt, SDPatternOperator st> { + ValueType store_vt = i32, SDPatternOperator st = null_frag> { let mayLoad = 0, mayStore = 1 in { defm : MUBUF_m <op, name, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, tfe:$tfe), name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# - "$glc"#"$slc"#"$tfe", []>; + "$glc"#"$slc"#"$tfe", []>; let offen = 0, idxen = 0, vaddr = 0 in { defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, - SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; @@ -1939,30 +2295,52 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, let offen = 1, idxen = 0 in { defm 
_OFFEN : MUBUF_m <op, name#"_offen", (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# "$glc"#"$slc"#"$tfe", []>; } // end offen = 1, idxen = 0 - let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0, - soffset = 128 /* ZERO */ in { + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs), + (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 0 in { defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", + (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"# + "$offset"#"$glc"#"$slc"#"$tfe", [(st store_vt:$vdata, - (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>; + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe))]>; } } // End mayLoad = 0, mayStore = 1 } class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$data), + FLAT <op, (outs regClass:$vdst), (ins VReg_64:$addr), - asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> { + asm#" $vdst, $addr, [M0, FLAT_SCRATCH]", []> { let glc = 0; let slc = 0; let tfe = 0; + let data = 0; let mayLoad = 1; } @@ -1978,6 +2356,7 @@ class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : let glc = 0; let slc = 0; let tfe = 0; + let vdst = 0; } class MIMG_Mask <string op, int channels> { @@ -1996,7 +2375,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm, asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," #" $tfe, $lwe, $slc, $vaddr, $srsrc", []> { - let SSAMP = 0; + let ssamp = 0; let mayLoad = 1; let mayStore = 0; let hasPostISelHook = 1; @@ -2143,15 +2522,6 @@ def getVOPe32 : InstrMapping { let ValueCols = [["4"]]; } -// Maps an original opcode to its commuted version -def getCommuteRev : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; @@ -2169,6 +2539,33 @@ def getCommuteOrig : InstrMapping { let ValueCols = [["1"]]; } +// Maps an original opcode to its commuted version +def getCommuteRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +def getCommuteCmpOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its 
commuted version +def getCommuteCmpRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + + def getMCOpcodeGen : InstrMapping { let FilterClass = "SIMCInstr"; let RowFields = ["PseudoInstr"]; diff --git a/contrib/llvm/lib/Target/R600/SIInstructions.td b/contrib/llvm/lib/Target/R600/SIInstructions.td index bbedef2..2f39074 100644 --- a/contrib/llvm/lib/Target/R600/SIInstructions.td +++ b/contrib/llvm/lib/Target/R600/SIInstructions.td @@ -26,20 +26,17 @@ def SendMsgImm : Operand<i32> { let PrintMethod = "printSendMsg"; } -def isGCN : Predicate<"Subtarget.getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">; -def isSICI : Predicate< - "Subtarget.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" - "Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->; -def isCI : Predicate<"Subtarget.getGeneration() " - ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isVI : Predicate < - "Subtarget.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS" ->; +def isGCN : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureGCN">; +def isSI : Predicate<"Subtarget->getGeneration() " + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; @@ -103,7 +100,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < //===----------------------------------------------------------------------===// let isMoveImm = 1 in { - let isReMaterializable = 1 in { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>; } // let isRematerializeable = 1 @@ -133,20 +130,20 @@ defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; let Defs = [SCC] in { - //defm S_BCNT0_I32_B32 : SOP1_BCNT0 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; - //defm S_BCNT0_I32_B64 : SOP1_BCNT0 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; + defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; + defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32", [(set i32:$dst, (ctpop i32:$src0))] >; defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>; } // End Defs = [SCC] -//defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; -//defm S_FF0_I32_B64 : SOP1_FF0 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; +defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; +defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", [(set i32:$dst, (cttz_zero_undef i32:$src0))] >; -////defm S_FF1_I32_B64 : SOP1_FF1 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; +defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", [(set i32:$dst, (ctlz_zero_undef i32:$src0))] @@ -164,10 +161,10 @@ defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16", [(set i32:$dst, (sext_inreg i32:$src0, i16))] >; -////defm S_BITSET0_B32 
: SOP1_BITSET0 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; -////defm S_BITSET0_B64 : SOP1_BITSET0 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; -////defm S_BITSET1_B32 : SOP1_BITSET1 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; -////defm S_BITSET1_B64 : SOP1_BITSET1 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; +defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; +defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; +defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; +defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; @@ -192,7 +189,7 @@ defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; -//defm S_CBRANCH_JOIN : SOP1_ <sop1<0x32, 0x2e>, "s_cbranch_join", []>; +defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; let Defs = [SCC] in { defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>; @@ -227,22 +224,22 @@ defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", } // End Uses = [SCC] defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", - [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] + [(set i32:$dst, (smin i32:$src0, i32:$src1))] >; defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", - [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] + [(set i32:$dst, (umin i32:$src0, i32:$src1))] >; defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", - [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] + [(set i32:$dst, (smax i32:$src0, i32:$src1))] >; defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", - [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] + [(set i32:$dst, (umax i32:$src0, i32:$src1))] >; } // End Defs = [SCC] -defm S_CSELECT_B32 : SOP2_SELECT_32 <sop2<0x0a>, "s_cselect_b32", []>; let Uses = [SCC] in { + defm S_CSELECT_B32 : SOP2_32 <sop2<0x0a>, "s_cselect_b32", []>; defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>; } // End Uses = [SCC] @@ -306,7 +303,8 @@ defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", >; } // End Defs = [SCC] -defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", []>; +defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>; defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", [(set i32:$dst, (mul i32:$src0, i32:$src1))] @@ -321,7 +319,13 @@ defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>; defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; } // End Defs = [SCC] -//defm S_CBRANCH_G_FORK : SOP2_ <sop2<0x2b, 0x29>, "s_cbranch_g_fork", []>; +let sdst = 0 in { +defm S_CBRANCH_G_FORK : SOP2_m < + sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] +>; +} + let Defs = [SCC] in { defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; } // End Defs = [SCC] @@ -378,6 +382,7 @@ defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", >; */ +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", []>; defm S_CMPK_LG_I32 : 
SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>; defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>; defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>; @@ -391,18 +396,27 @@ defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>; defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>; } // End isCompare = 1 -let isCommutable = 1 in { - let Defs = [SCC], isCommutable = 1 in { - defm S_ADDK_I32 : SOPK_32 <sopk<0x0f, 0x0e>, "s_addk_i32", []>; - } - defm S_MULK_I32 : SOPK_32 <sopk<0x10, 0x0f>, "s_mulk_i32", []>; +let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", + Constraints = "$sdst = $src0" in { + defm S_ADDK_I32 : SOPK_32TIE <sopk<0x0f, 0x0e>, "s_addk_i32", []>; + defm S_MULK_I32 : SOPK_32TIE <sopk<0x10, 0x0f>, "s_mulk_i32", []>; } -//defm S_CBRANCH_I_FORK : SOPK_ <sopk<0x11, 0x10>, "s_cbranch_i_fork", []>; +defm S_CBRANCH_I_FORK : SOPK_m < + sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), + (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>; -defm S_SETREG_B32 : SOPK_32 <sopk<0x13, 0x12>, "s_setreg_b32", []>; -defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; -//defm S_SETREG_IMM32_B32 : SOPK_32 <sopk<0x15, 0x14>, "s_setreg_imm32_b32", []>; +defm S_SETREG_B32 : SOPK_m < + sopk<0x13, 0x12>, "s_setreg_b32", (outs), + (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +// FIXME: Not on SI? +//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; +defm S_SETREG_IMM32_B32 : SOPK_IMM32 < + sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), + (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" +>; //===----------------------------------------------------------------------===// // SOPP Instructions @@ -477,13 +491,11 @@ def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; -let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "s_sendmsg $simm16", - [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] - > { - let DisableEncoding = "$m0"; - } -} // End Uses = [EXEC] +let Uses = [EXEC, M0] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] + >; +} // End Uses = [EXEC, M0] def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; @@ -501,31 +513,30 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { // VOPC Instructions //===----------------------------------------------------------------------===// -let isCompare = 1 in { +let isCompare = 1, isCommutable = 1 in { defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">; -defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>; +defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>; +defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>; defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>; defm V_CMP_GE_F32 : VOPC_F32 
<vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>; defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>; defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>; -defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT>; +defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>; -defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>; +defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>; defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>; defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>; defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">; -let hasSideEffects = 1 in { defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">; -defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32", "v_cmpx_gt_f32">; defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">; -defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32", "v_cmpx_ge_f32">; defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">; defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">; defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">; @@ -539,233 +550,207 @@ defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">; defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">; defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">; -} // End hasSideEffects = 1 defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">; -defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>; +defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>; -defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>; +defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>; defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>; defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>; defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>; defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>; -defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>; +defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>; -defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>; +defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>; defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>; defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>; defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">; -let hasSideEffects = 1 in { defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">; -defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, 
"v_cmpx_lt_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64", "v_cmpx_gt_f64">; defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">; -defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64", "v_cmpx_ge_f64">; defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">; defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">; defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">; defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">; defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">; -defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64", "v_cmpx_nle_f64">; defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">; -defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">; defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">; defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">; defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">; -} // End hasSideEffects = 1 let SubtargetPredicate = isSICI in { defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">; -defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">; -defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">; defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">; defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">; defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">; defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">; -defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">; -defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">; defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">; defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">; defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">; -let hasSideEffects = 1 in { defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">; -defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">; -defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">; defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">; defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">; defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">; defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">; -defm V_CMPSX_NGE_F32 : VOPCX_F32 
<vopc<0x59>, "v_cmpsx_nge_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">; -defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">; defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">; defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">; defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">; -} // End hasSideEffects = 1 defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">; -defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; -defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">; defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; -defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; -defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; -let hasSideEffects = 1, Defs = [EXEC] in { - -defm V_CMPSX_F_F64 : VOPC_F64 <vopc<0x70>, "v_cmpsx_f_f64">; -defm V_CMPSX_LT_F64 : VOPC_F64 <vopc<0x71>, "v_cmpsx_lt_f64">; -defm V_CMPSX_EQ_F64 : VOPC_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; -defm V_CMPSX_LE_F64 : VOPC_F64 <vopc<0x73>, "v_cmpsx_le_f64">; -defm V_CMPSX_GT_F64 : VOPC_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; -defm V_CMPSX_LG_F64 : VOPC_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; -defm V_CMPSX_GE_F64 : VOPC_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; -defm V_CMPSX_O_F64 : VOPC_F64 <vopc<0x77>, "v_cmpsx_o_f64">; -defm V_CMPSX_U_F64 : VOPC_F64 <vopc<0x78>, "v_cmpsx_u_f64">; -defm V_CMPSX_NGE_F64 : VOPC_F64 <vopc<0x79>, "v_cmpsx_nge_f64">; -defm V_CMPSX_NLG_F64 : VOPC_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; -defm V_CMPSX_NGT_F64 : VOPC_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64">; -defm V_CMPSX_NLE_F64 : VOPC_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; -defm V_CMPSX_NEQ_F64 : VOPC_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; -defm V_CMPSX_NLT_F64 : VOPC_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; -defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; - -} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMPSX_F_F64 : VOPCX_F64 <vopc<0x70>, "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 <vopc<0x71>, "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 <vopc<0x73>, "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; +defm 
V_CMPSX_GE_F64 : VOPCX_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 <vopc<0x77>, "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 <vopc<0x78>, "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 <vopc<0x79>, "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; } // End SubtargetPredicate = isSICI defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">; -defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>; +defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>; -defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>; +defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>; defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>; defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>; defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">; -let hasSideEffects = 1 in { defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">; -defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32", "v_cmpx_gt_i32">; defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">; -defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32", "v_cmpx_ge_i32">; defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">; defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">; defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">; defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">; -} // End hasSideEffects = 1 defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">; -defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>; +defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>; -defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>; +defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>; defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>; defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>; defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">; -let hasSideEffects = 1 in { defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">; -defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64", "v_cmpx_gt_i64">; defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">; -defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64", "v_cmpx_ge_i64">; defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, 
"v_cmpx_gt_i64">; defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">; defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">; defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">; -} // End hasSideEffects = 1 defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">; -defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>; +defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>; -defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE>; +defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>; defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>; defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>; defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">; -let hasSideEffects = 1 in { defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">; -defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32", "v_cmpx_gt_u32">; defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">; -defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32", "v_cmpx_le_u32">; defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">; defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">; defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">; defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">; -} // End hasSideEffects = 1 defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">; -defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>; +defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>; -defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>; +defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>; defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>; defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>; defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">; -let hasSideEffects = 1 in { - defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">; -defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64", "v_cmpx_gt_u64">; defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">; -defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64", "v_cmpx_ge_u64">; defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">; defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">; defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">; defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">; -} // End hasSideEffects = 1 +} // End isCompare = 1, isCommutable = 1 defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">; - -let hasSideEffects = 1 in { defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">; -} // End 
hasSideEffects = 1 - defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">; - -let hasSideEffects = 1 in { defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">; -} // End hasSideEffects = 1 - -} // End isCompare = 1 //===----------------------------------------------------------------------===// // DS Instructions //===----------------------------------------------------------------------===// - defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; @@ -778,12 +763,26 @@ defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; -defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +let mayLoad = 0 in { +defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +} defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>; -defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>; - +defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; +defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; +defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; +defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; +defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; +let mayLoad = 0 in { +defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; +} defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; @@ -796,20 +795,34 @@ defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32" defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; -//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">; -//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < + 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < + 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 +>; defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, 
"ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; - +defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; let SubtargetPredicate = isCI in { defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; } // End isCI - - +defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; +let mayStore = 0 in { +defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; +defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; +} +defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; +defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; +defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; @@ -822,7 +835,12 @@ defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +} defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; @@ -840,57 +858,88 @@ defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64" defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; -//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">; -//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, 
"ds_min_f64">; defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; +let mayStore = 0 in { +defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; +defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; +} + +defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; +defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; +defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; +defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; +defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; +defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; +defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; +defm DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; +defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; +defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; +defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; +defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; + +defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; +defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; + +defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; +defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; +defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; +defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; +defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; +defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; +defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; +defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; +defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; +defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; +defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; +defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; + +defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; +defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; + //let SubtargetPredicate = isCI in { // DS_CONDXCHG32_RTN_B64 // DS_CONDXCHG32_RTN_B128 //} // End isCI -// TODO: _SRC2_* forms - -defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>; -defm DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VGPR_32>; -defm DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VGPR_32>; -defm DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>; - -defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>; -defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>; -defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>; -defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>; -defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>; -defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>; - -// 2 forms. 
-defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>; -defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>; - -defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>; -defm DS_READ2_B64 : DS_Load2_Helper <0x00000077, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000078, "ds_read2st64_b64", VReg_128>; - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// -//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>; -//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>; -//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>; -//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>; -//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>; -//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>; -//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < + mubuf<0x00>, "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < + mubuf<0x01>, "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < + mubuf<0x02>, "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < + mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < + mubuf<0x04>, "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < + mubuf<0x05>, "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < + mubuf<0x06>, "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < + mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 +>; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global >; @@ -1182,9 +1231,11 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < // VOP1 Instructions //===----------------------------------------------------------------------===// -//def V_NOP : VOP1_ <0x00000000, "v_nop", []>; +let vdst = 0, src0 = 0 in { +defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +} -let isMoveImm = 1 in { +let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; } // End isMoveImm = 1 @@ -1222,16 +1273,17 @@ defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32", defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32", VOP_I32_F32, fp_to_sint >; -defm V_MOV_FED_B32 : VOP1Inst <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", VOP_I32_F32, fp_to_f16 >; defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", VOP_F32_I32, f16_to_fp >; -//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>; -//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", 
[]>; -defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>; +defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32", + VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32", + VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>; defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", VOP_F32_F64, fround >; @@ -1329,16 +1381,24 @@ defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; -//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", + VOP_I32_F64 +>; defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", VOP_F64_F64 >; defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; -//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>; +defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", + VOP_I32_F32 +>; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", VOP_F32_F32 >; -//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>; +let vdst = 0, src0 = 0 in { +defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], + "v_clrexcp" +>; +} defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; @@ -1348,6 +1408,7 @@ let SubtargetPredicate = isSICI in { let SchedRW = [WriteQuarterRate32] in { +defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; @@ -1375,40 +1436,68 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", // VINTRP Instructions //===----------------------------------------------------------------------===// +let Uses = [M0] in { + // FIXME: Specify SchedRW for VINTRP instructions.
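// The VINTRP rework that follows drops the explicit M0Reg:$m0 operand and
// instead wraps the definitions in "let Uses = [M0]", so M0 becomes an
// implicit use and the asm strings print the literal register name "[m0]".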
-defm V_INTERP_P1_F32 : VINTRP_m < - 0x00000000, "v_interp_p1_f32", + +multiclass V_INTERP_P1_F32_m : VINTRP_m < + 0x00000000, (outs VGPR_32:$dst), - (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]", - "$m0">; + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), + (i32 imm:$attr)))] +>; + +let OtherPredicates = [has32BankLDS] in { + +defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS] + +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { + +defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" + +let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { defm V_INTERP_P2_F32 : VINTRP_m < - 0x00000001, "v_interp_p2_f32", + 0x00000001, (outs VGPR_32:$dst), - (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", - "$src0,$m0", - "$src0 = $dst">; + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" defm V_INTERP_MOV_F32 : VINTRP_m < - 0x00000002, "v_interp_mov_f32", + 0x00000002, (outs VGPR_32:$dst), - (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]", - "$m0">; + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End Uses = [M0] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// -defm V_CNDMASK_B32_e64 : VOP3_m_nosrcmod <vop3<0x100>, (outs VGPR_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2), - "v_cndmask_b32_e64 $dst, $src0, $src1, $src2", - [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))], - "v_cndmask_b32_e64", 3 ->; +multiclass V_CNDMASK <vop2 op, string name> { + defm _e32 : VOP2_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], + name, name>; + + defm _e64 : VOP3_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, + name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>; +} +defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", @@ -1434,11 +1523,18 @@ defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24 >; -//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>; + +defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24", + VOP_I32_I32_I32 +>; + defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24 >; -//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>; + +defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24", + VOP_I32_I32_I32 +>; defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, fminnum>; @@ -1545,8 +1641,8 @@ defm
V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32", >; } // End isCommutable = 1 -defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32, - AMDGPUbfm +defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", + VOP_I32_I32_I32 >; defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 @@ -1561,7 +1657,6 @@ defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp >; - defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" @@ -1615,14 +1710,12 @@ defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", VOP_F32_F32_F32_F32 >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", VOP_I32_I32_I32_I32, AMDGPUbfe_u32 >; defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", VOP_I32_I32_I32_I32, AMDGPUbfe_i32 >; -} defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", VOP_I32_I32_I32_I32, AMDGPUbfi @@ -1810,6 +1903,11 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", //===----------------------------------------------------------------------===// let isCodeGenOnly = 1, isPseudo = 1 in { +// For use in patterns +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] +>; + let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. @@ -1950,17 +2048,6 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; } // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] -let usesCustomInserter = 1 in { - -def V_SUB_F64 : InstSI < - (outs VReg_64:$dst), - (ins VReg_64:$src0, VReg_64:$src1), - "v_sub_f64 $dst, $src0, $src1", - [(set f64:$dst, (fsub f64:$src0, f64:$src1))] ->; - -} // end usesCustomInserter - multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { let UseNamedOperandTable = 1 in { @@ -1979,14 +2066,17 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { } // End UseNamedOperandTable = 1 } -defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; +// It's unclear whether you can use M0 as the output of v_readlane_b32 +// instructions, so use SGPR_32 register class for spills to prevent +// this from happening. 
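// (For context: SGPR_32 covers only SGPR0-SGPR101, while the wider SReg_32
// class also includes M0 and the VCC/EXEC/FLAT_SCR halves, so spilling
// through SReg_32 could let the allocator pick M0 as a v_readlane_b32
// destination.)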
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SGPR_32>; defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1 in { + let UseNamedOperandTable = 1, VGPRSpill = 1 in { def _SAVE : InstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, @@ -1999,7 +2089,7 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] >; - } // End UseNamedOperandTable = 1 + } // End UseNamedOperandTable = 1, VGPRSpill = 1 } defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; @@ -2040,7 +2130,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) >; /* int_SI_export */ @@ -2187,6 +2277,11 @@ def : Pat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : Pat < + (i32 (select i1:$src0, i32:$src1, i32:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ @@ -2659,27 +2754,6 @@ def : Pat < (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm))) >; -/********** ===================== **********/ -/********** Interpolation Paterns **********/ -/********** ===================== **********/ - -// The value of $params is constant through out the entire kernel. -// We need to use S_MOV_B32 $params, because CSE ignores copies, so -// without it we end up with a lot of redundant moves. 
- -def : Pat < - (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params), - (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)) ->; - -def : Pat < - (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij), - (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0), - imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)), - (EXTRACT_SUBREG $ij, sub1), - imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)) ->; - /********** ================== **********/ /********** Intrinsic Patterns **********/ /********** ================== **********/ @@ -2729,7 +2803,7 @@ def : Ext32Pat <anyext>; // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) >; // The multiplication scales from [0,1] to the unsigned integer range @@ -2763,9 +2837,6 @@ def : Pat < (V_MUL_HI_I32 $src0, $src1) >; -def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>; - - defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; @@ -2775,52 +2846,52 @@ def : ROTRPattern <V_ALIGNBIT_B32>; class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, (as_i16imm $offset), (i1 0)) >; -def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; -def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>; -def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>; -def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>; -def : DSReadPat <DS_READ_B32, i32, local_load>; +def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; +def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; +def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; +def : DSReadPat <DS_READ_B32, i32, si_load_local>; let AddedComplexity = 100 in { -def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>; +def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>; } // End AddedComplexity = 100 def : Pat < - (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), - (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1)) + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) >; class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; -def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; -def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; -def : DSWritePat <DS_WRITE_B32, i32, local_store>; +def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; +def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; +def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; let AddedComplexity = 100 in { -def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>; +def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; } // End AddedComplexity = 100 def : Pat < - (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, - (S_MOV_B32 -1)) + (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + 
(DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (i1 0)) >; class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec @@ -2836,53 +2907,53 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < class DSAtomicIncRetPat<DS inst, ValueType vt, Instruction LoadImm, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) >; class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) >; // 32-bit atomics. def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, atomic_load_add_local>; + S_MOV_B32, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, atomic_load_sub_local>; + S_MOV_B32, si_atomic_load_sub_local>; -def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>; -def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>; -def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>; -def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>; -def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>; -def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>; -def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>; -def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>; -def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>; -def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>; +def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; -def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>; +def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. 
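// (The 64-bit set below mirrors the 32-bit patterns above: the same
// si_atomic_* fragments select onto the VReg_64-based RTN instructions, with
// S_MOV_B64 materializing the -1 constant for the inc/dec special cases.)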
def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, atomic_load_add_local>; + S_MOV_B64, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, atomic_load_sub_local>; + S_MOV_B64, si_atomic_load_sub_local>; -def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>; -def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>; -def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>; -def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>; -def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>; -def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>; -def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>; -def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>; -def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>; -def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>; +def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>; -def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; +def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; //===----------------------------------------------------------------------===// @@ -2892,8 +2963,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, PatFrag constant_ld> { def : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))), - (Instr_ADDR64 $srsrc, $vaddr, $offset) + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) >; } @@ -2910,7 +2982,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), - (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; @@ -2929,7 +3001,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offset $rsrc, (as_i16imm $offset), $soffset, (as_i1imm $glc), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2937,7 +3009,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), 
(as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2945,15 +3017,15 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), - (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm, 1, 1, imm:$glc, imm:$slc, + imm:$offset, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), - (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), + (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; } @@ -2968,7 +3040,7 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_ class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; @@ -3098,26 +3170,26 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I // 1. Extract with offset def : Pat< - (vector_extract vt:$vec, (add i32:$idx, imm:$off)), - (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) + (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), + (SI_INDIRECT_SRC $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (vector_extract vt:$vec, i32:$idx), - (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) + (eltvt (vector_extract vt:$vec, i32:$idx)), + (SI_INDIRECT_SRC $vec, $idx, 0) >; // 3. Insert with offset def : Pat< (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val) + (IndDst $vec, $idx, imm:$off, $val) >; // 4. Insert without offset def : Pat< (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val) + (IndDst $vec, $idx, 0, $val) >; } @@ -3263,10 +3335,71 @@ def : Pat < (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; +multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { + def : Pat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : Pat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV 0)) + >; +} + +defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; +// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; + +def : BFEPattern <V_BFE_U32, S_MOV_B32>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// +let Predicates = [isSI] in { + +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient +// way to implement it is using V_FRACT_F64. +// The workaround for the V_FRACT bug is: +// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) + +// Convert (x + (-floor(x)) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) +>; + +// Convert floor(x) to (x - fract(x)) +def : Pat < + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), + (V_ADD_F64 + $mods, + $x, + SRCMODS.NEG, + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isSI] + let Predicates = [isCI] in { // Convert (x - floor(x)) to fract(x) @@ -3291,4 +3424,12 @@ def : Pat < def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; +//============================================================================// +// Assembler aliases +//============================================================================// + +def : MnemonicAlias<"v_add_u32", "v_add_i32">; +def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; +def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; + } // End isGCN predicate diff --git a/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp index 0cb6746..9b1d256 100644 --- a/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp @@ -45,6 +45,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -55,7 +56,6 @@ namespace { class SILoadStoreOptimizer : public MachineFunctionPass { private: - const TargetMachine *TM; const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; @@ -86,20 +86,11 @@ private: public: static char ID; - SILoadStoreOptimizer() : - MachineFunctionPass(ID), - TM(nullptr), - TII(nullptr), - TRI(nullptr), - MRI(nullptr), - LIS(nullptr) { + SILoadStoreOptimizer() + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), + LIS(nullptr) {} - } - - SILoadStoreOptimizer(const TargetMachine &TM_) : - MachineFunctionPass(ID), - TM(&TM_), - TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) { + SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } @@ -222,7 +213,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( // Be careful, since the addresses could be subregisters themselves in weird // cases, like vectors of pointers. 
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); unsigned DestReg1 @@ -259,41 +249,28 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( DebugLoc DL = I->getDebugLoc(); MachineInstrBuilder Read2 = BuildMI(*MBB, I, DL, Read2Desc, DestReg) - .addImm(0) // gds .addOperand(*AddrReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 - .addOperand(*M0Reg) // M0 + .addImm(0) // gds .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); - LIS->InsertMachineInstrInMaps(Read2); - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); LIS->RemoveMachineInstrFromMaps(I); - LIS->RemoveMachineInstrFromMaps(Paired); + // Replacing Paired in the maps with Read2 allows us to avoid updating the + // live range for the m0 register. + LIS->ReplaceMachineInstrInMaps(Paired, Read2); I->eraseFromParent(); Paired->eraseFromParent(); LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); LIS->shrinkToUses(&AddrRegLI); - LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg()); - LIS->shrinkToUses(&M0RegLI); - - // Currently m0 is treated as a register class with one member instead of an - // implicit physical register. We are using the virtual register for the first - // one, but we still need to update the live range of the now unused second m0 - // virtual register to avoid verifier errors. - const MachineOperand *PairedM0Reg - = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0); - LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg()); - LIS->shrinkToUses(&PairedM0RegLI); - LIS->getInterval(DestReg); // Create new LI DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); @@ -309,7 +286,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); const MachineOperand *Data1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); @@ -340,29 +316,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = I->getDebugLoc(); + // repairIntervalsInRange() doesn't handle physical registers, so we have + // to update the M0 range manually.
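// (LiveIntervals::repairIntervalsInRange() only rebuilds the ranges of the
// virtual registers it is given; M0 is a physical register tracked as a
// regunit LiveRange, so the segment ending at the paired instruction has to
// be extended by hand to the new write2's slot, as done below.)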
+ SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); + LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); + bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + MachineInstrBuilder Write2 = BuildMI(*MBB, I, DL, Write2Desc) - .addImm(0) // gds .addOperand(*Addr) // addr .addOperand(*Data0) // data0 .addOperand(*Data1) // data1 .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 - .addOperand(*M0Reg) // m0 + .addImm(0) // gds .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); // XXX - How do we express subregisters here? - unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(), - M0Reg->getReg()}; + unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; LIS->RemoveMachineInstrFromMaps(I); LIS->RemoveMachineInstrFromMaps(Paired); I->eraseFromParent(); Paired->eraseFromParent(); + // This doesn't handle physical registers like M0 LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); + if (UpdateM0Range) { + SlotIndex Write2Index = LIS->getInstructionIndex(Write2); + M0Segment->end = Write2Index.getRegSlot(); + } + DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); return Write2.getInstr(); } @@ -414,9 +401,9 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { } bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl(); - TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo()); - TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo()); + const TargetSubtargetInfo &STM = MF.getSubtarget(); + TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo()); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); diff --git a/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp index 198dd56..587ea63 100644 --- a/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -39,8 +39,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( unsigned FrameIndex, unsigned SubIdx) { const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>( - MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); MachineRegisterInfo &MRI = MF->getRegInfo(); int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); Offset += SubIdx * 4; @@ -70,7 +70,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should get this information from kernel attributes if it // is available. return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); diff --git a/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp index 0a57a5b..0a7f684 100644 --- a/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp +++ b/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp @@ -128,80 +128,66 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; RS.forward(I); DebugLoc DL = MI.getDebugLoc(); - switch(MI.getOpcode()) { - default: break; - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: - case AMDGPU::SI_SPILL_V32_RESTORE: - case AMDGPU::SI_SPILL_V64_RESTORE: - case AMDGPU::SI_SPILL_V128_RESTORE: - case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; // Size - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - - break; + if (!TII->isVGPRSpill(MI.getOpcode())) + continue; + + // Scratch resource + unsigned ScratchRsrcReg = + RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = 
TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) + .addImm(Rsrc & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) + .addImm(Rsrc >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + // Scratch Offset + if (ScratchOffsetReg == AMDGPU::NoRegister) { + ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), + ScratchOffsetReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else if (!MBB.isLiveIn(ScratchOffsetReg)) { + MBB.addLiveIn(ScratchOffsetReg); } + + if (ScratchRsrcReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("ran out of SGPRs for spilling VGPRs"); + ScratchRsrcReg = AMDGPU::SGPR0; + ScratchOffsetReg = AMDGPU::SGPR0; + } + MI.getOperand(2).setReg(ScratchRsrcReg); + MI.getOperand(2).setIsKill(true); + MI.getOperand(2).setIsUndef(false); + MI.getOperand(3).setReg(ScratchOffsetReg); + MI.getOperand(3).setIsUndef(false); + MI.getOperand(3).setIsKill(false); + MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); } } return true; diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp index f502991..db2ff0b 100644 --- a/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp @@ -24,9 +24,7 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) -: AMDGPURegisterInfo(st) - { } +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -48,7 +46,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. - if (ST.hasSGPRInitBug()) { + if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). // Assume XNACK_MASK is unused. @@ -66,12 +64,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. 
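// (With the cached subtarget reference gone from SIRegisterInfo, the SGPR and
// VGPR pressure limits are now derived per-function via
// MF.getSubtarget<AMDGPUSubtarget>() instead of an ST member.)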
- unsigned SGPRLimit = getNumSGPRsAllowed(ST.getGeneration(), - ST.getMaxWavesPerCU()); - unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU()); + unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), + STI.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { @@ -142,9 +142,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, int64_t Offset, RegScavenger *RS) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); MachineBasicBlock *MBB = MI->getParent(); const MachineFunction *MF = MI->getParent()->getParent(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); LLVMContext &Ctx = MF->getFunction()->getContext(); DebugLoc DL = MI->getDebugLoc(); bool IsLoad = TII->get(LoadStoreOp).mayLoad(); @@ -179,8 +180,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addImm(Offset) .addReg(SOffset) + .addImm(Offset) .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe @@ -195,7 +196,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); MachineOperand &FIOp = MI->getOperand(FIOperandNum); @@ -243,7 +245,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), &AMDGPU::SGPR_32RegClass, i); - bool isM0 = SubReg == AMDGPU::M0; struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); @@ -252,23 +253,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Ctx.emitError("Ran out of VGPRs for spilling SGPR"); } - if (isM0) - SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) .addReg(Spill.VGPR) .addImm(Spill.Lane) .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - if (isM0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(SubReg); - } } // TODO: only do this when it is needed - switch (ST.getGeneration()) { + switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI TII->insertNOPs(MI, 3); diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.h b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h index 1dfe530..bfdb67c 100644 --- a/contrib/llvm/lib/Target/R600/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h @@ -24,11 +24,12 @@ namespace llvm { struct SIRegisterInfo : public AMDGPURegisterInfo { - SIRegisterInfo(const AMDGPUSubtarget &st); + SIRegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getRegPressureSetLimit(unsigned Idx) const override; + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const 
override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.td b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td index c63f305..2a9017f 100644 --- a/contrib/llvm/lib/Target/R600/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td @@ -21,7 +21,7 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -36,14 +36,14 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { let HWEncoding = 126; } -def SCC : SIReg<"SCC", 253>; -def M0 : SIReg <"M0", 124>; +def SCC : SIReg<"scc", 253>; +def M0 : SIReg <"m0", 124>; def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. // Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> { +def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 104; @@ -66,7 +66,7 @@ foreach Index = 0-255 in { //===----------------------------------------------------------------------===// // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add (sequence "SGPR%u", 0, 101))>; // SGPR 64-bit registers @@ -113,7 +113,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, (add (decimate (shl SGPR_32, 15), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add (sequence "VGPR%u", 0, 255))>; // VGPR 64-bit registers @@ -169,6 +169,11 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Register classes used as source and destination //===----------------------------------------------------------------------===// +class RegImmMatcher<string name> : AsmOperandClass { + let Name = name; + let RenderMethod = "addRegOrImmOperands"; +} + // Special register classes for predicates and the M0 register def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { let CopyCost = -1; // Theoretically it is possible to read from SCC, @@ -177,11 +182,10 @@ def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; -def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>; // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, - (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; @@ -227,15 +231,21 @@ class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> { // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -def SSrc_32 : RegImmOperand<SReg_32>; +def SSrc_32 : 
RegImmOperand<SReg_32> { + let ParserMatchClass = RegImmMatcher<"SSrc32">; +} -def SSrc_64 : RegImmOperand<SReg_64>; +def SSrc_64 : RegImmOperand<SReg_64> { + let ParserMatchClass = RegImmMatcher<"SSrc64">; +} //===----------------------------------------------------------------------===// // SCSrc_* Operands with an SGPR or an inline constant //===----------------------------------------------------------------------===// -def SCSrc_32 : RegInlineOperand<SReg_32>; +def SCSrc_32 : RegInlineOperand<SReg_32> { + let ParserMatchClass = RegImmMatcher<"SCSrc32">; +} //===----------------------------------------------------------------------===// // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate @@ -245,21 +255,30 @@ def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; -def VSrc_32 : RegImmOperand<VS_32>; +def VSrc_32 : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc32">; +} -def VSrc_64 : RegImmOperand<VS_64>; +def VSrc_64 : RegisterOperand<VS_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc64">; +} //===----------------------------------------------------------------------===// // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// -def VCSrc_32 : RegInlineOperand<VS_32>; - -def VCSrc_64 : RegInlineOperand<VS_64>; - -//===----------------------------------------------------------------------===// -// SGPR and VGPR register classes -//===----------------------------------------------------------------------===// +def VCSrc_32 : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc32">; +} -def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, - (add VReg_128, SReg_128)>; +def VCSrc_64 : RegisterOperand<VS_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc64">; +} diff --git a/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp index 6a34106..51e72cd 100644 --- a/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp @@ -18,9 +18,10 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "si-shrink-instructions" @@ -88,6 +89,11 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); // Can't shrink instruction with three operands. + // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add + // a special case for it. It can only be shrunk if the third operand + // is vcc. We should handle this the same way we handle vopc, by adding + // a register allocation hint pre-regalloc and then doing the shrinking + // post-regalloc.
diff --git a/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp
index 6a34106..51e72cd 100644
--- a/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp
@@ -18,9 +18,10 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 
 #define DEBUG_TYPE "si-shrink-instructions"
@@ -88,6 +89,11 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
   const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
 
   // Can't shrink instruction with three operands.
+  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
+  // a special case for it. It can only be shrunk if the third operand
+  // is vcc. We should handle this the same way we handle vopc, by adding
+  // a register allocation hint pre-regalloc and then doing the shrinking
+  // post-regalloc.
   if (Src2)
     return false;
 
@@ -127,30 +133,31 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                                       TII->isVOPC(MI.getOpcode()));
 
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
 
   // Only one literal constant is allowed per instruction, so if src0 is a
   // literal constant then we can't do any folding.
-  if (Src0->isImm() && TII->isLiteralConstant(*Src0))
+  if (Src0.isImm() &&
+      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
     return;
-
   // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
   // SGPR, we cannot commute the instruction, so we can't fold any literal
   // constants.
-  if (Src0->isReg() && !isVGPR(Src0, TRI, MRI))
+  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
    return;
 
   // Try to fold Src0
-  if (Src0->isReg()) {
-    unsigned Reg = Src0->getReg();
+  if (Src0.isReg()) {
+    unsigned Reg = Src0.getReg();
     MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
 
     if (Def && Def->isMoveImmediate()) {
       MachineOperand &MovSrc = Def->getOperand(1);
       bool ConstantFolded = false;
 
       if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
-        Src0->ChangeToImmediate(MovSrc.getImm());
+        Src0.ChangeToImmediate(MovSrc.getImm());
         ConstantFolded = true;
       }
       if (ConstantFolded) {
@@ -189,7 +196,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       const MachineOperand &Src = MI.getOperand(1);
 
       if (Src.isImm()) {
-        if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src))
+        if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
           MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
       }
diff --git a/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp b/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp
index 9318dc1..591ce85 100644
--- a/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp
+++ b/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp
@@ -61,8 +61,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
 }
 
 bool SITypeRewriter::runOnFunction(Function &F) {
-  AttributeSet Set = F.getAttributes();
-  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+  Attribute A = F.getFnAttribute("ShaderType");
 
   unsigned ShaderType = ShaderType::COMPUTE;
   if (A.isStringAttribute()) {
@@ -105,7 +104,7 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
   SmallVector <Type*, 8> Types;
   bool NeedToReplace = false;
   Function *F = I.getCalledFunction();
-  std::string Name = F->getName().str();
+  std::string Name = F->getName();
   for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
     Value *Arg = I.getArgOperand(i);
     if (Arg->getType() == v16i8) {
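The last SIShrinkInstructions.cpp hunk above now passes an operand size to isInlineConstant when deciding whether an s_mov_b32 of an immediate can shrink to the shorter s_movk_i32 encoding. A small standalone model of that decision; the helper names are illustrative, not LLVM's:

// Minimal model of the s_mov_b32 -> s_movk_i32 shrink decision.
#include <cstdint>
#include <iostream>

static bool isInt16(int64_t x) { return x >= -32768 && x <= 32767; }
static bool isInlineConstant(int64_t x) { return x >= -16 && x <= 64; }

enum Opcode { S_MOV_B32, S_MOVK_I32 };

static Opcode selectMove(int64_t Imm) {
  // Inline constants already encode for free in s_mov_b32, so only
  // non-inline values that fit in 16 signed bits benefit from movk.
  if (isInt16(Imm) && !isInlineConstant(Imm))
    return S_MOVK_I32;
  return S_MOV_B32;
}

int main() {
  std::cout << (selectMove(7) == S_MOV_B32)       // inline constant: keep
            << (selectMove(1000) == S_MOVK_I32)   // 16-bit literal: shrink
            << (selectMove(100000) == S_MOV_B32)  // too big for movk
            << '\n';
}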
diff --git a/contrib/llvm/lib/Target/R600/VIInstrFormats.td b/contrib/llvm/lib/Target/R600/VIInstrFormats.td
index c242235..d8738f9 100644
--- a/contrib/llvm/lib/Target/R600/VIInstrFormats.td
+++ b/contrib/llvm/lib/Target/R600/VIInstrFormats.td
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 class DSe_vi <bits<8> op> : Enc64 {
-
   bits<8> vdst;
   bits<1> gds;
   bits<8> addr;
@@ -33,7 +32,6 @@ class DSe_vi <bits<8> op> : Enc64 {
 }
 
 class MUBUFe_vi <bits<7> op> : Enc64 {
-
   bits<12> offset;
   bits<1> offen;
   bits<1> idxen;
@@ -62,7 +60,6 @@ class MUBUFe_vi <bits<7> op> : Enc64 {
 }
 
 class MTBUFe_vi <bits<4> op> : Enc64 {
-
   bits<12> offset;
   bits<1> offen;
   bits<1> idxen;
@@ -93,7 +90,6 @@ class MTBUFe_vi <bits<4> op> : Enc64 {
 }
 
 class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
-
   bits<7> sbase;
   bits<7> sdata;
   bits<1> glc;
@@ -109,8 +105,7 @@ class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
 }
 
 class VOP3e_vi <bits<10> op> : Enc64 {
-
-  bits<8> dst;
+  bits<8> vdst;
   bits<2> src0_modifiers;
   bits<9> src0;
   bits<2> src1_modifiers;
@@ -120,7 +115,7 @@ class VOP3e_vi <bits<10> op> : Enc64 {
   bits<1> clamp;
   bits<2> omod;
 
-  let Inst{7-0} = dst;
+  let Inst{7-0} = vdst;
   let Inst{8} = src0_modifiers{1};
   let Inst{9} = src1_modifiers{1};
   let Inst{10} = src2_modifiers{1};
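The VOP3e_vi hunk renames dst to vdst and spells out the low bits of the 64-bit encoding. The sketch below packs only the fields visible in the hunk: vdst into Inst{7-0} and bit 1 of each source-modifier field into Inst{8}, Inst{9}, and Inst{10}. The opcode, source, clamp, and omod fields live elsewhere in the word and are omitted; the function is a stand-in, not generated TableGen output.

// Standalone sketch of the VOP3e_vi low-bit layout shown above.
#include <cassert>
#include <cstdint>

static uint64_t packVOP3eLow(uint8_t Vdst, uint8_t Src0Mods,
                             uint8_t Src1Mods, uint8_t Src2Mods) {
  uint64_t Inst = 0;
  Inst |= static_cast<uint64_t>(Vdst);                      // Inst{7-0} = vdst
  Inst |= static_cast<uint64_t>((Src0Mods >> 1) & 1) << 8;  // src0_modifiers{1}
  Inst |= static_cast<uint64_t>((Src1Mods >> 1) & 1) << 9;  // src1_modifiers{1}
  Inst |= static_cast<uint64_t>((Src2Mods >> 1) & 1) << 10; // src2_modifiers{1}
  return Inst;
}

int main() {
  // vdst = v3, bit 1 set only in src1's modifiers -> bit 9 of the encoding.
  assert(packVOP3eLow(3, 0, 2, 0) == (3u | (1u << 9)));
}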
diff --git a/contrib/llvm/lib/Target/R600/VIInstructions.td b/contrib/llvm/lib/Target/R600/VIInstructions.td
index 4a6e933..5bf86e6 100644
--- a/contrib/llvm/lib/Target/R600/VIInstructions.td
+++ b/contrib/llvm/lib/Target/R600/VIInstructions.td
@@ -9,6 +9,87 @@
 // Instruction definitions for VI and newer.
 //===----------------------------------------------------------------------===//
 
+let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in {
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+defm V_CVT_F16_U16 : VOP1Inst <vop1<0, 0x39>, "v_cvt_f16_u16", VOP_F16_I16>;
+defm V_CVT_F16_I16 : VOP1Inst <vop1<0, 0x3a>, "v_cvt_f16_i16", VOP_F16_I16>;
+defm V_CVT_U16_F16 : VOP1Inst <vop1<0, 0x3b>, "v_cvt_u16_f16", VOP_I16_F16>;
+defm V_CVT_I16_F16 : VOP1Inst <vop1<0, 0x3c>, "v_cvt_i16_f16", VOP_I16_F16>;
+defm V_RCP_F16 : VOP1Inst <vop1<0, 0x3d>, "v_rcp_f16", VOP_F16_F16>;
+defm V_SQRT_F16 : VOP1Inst <vop1<0, 0x3e>, "v_sqrt_f16", VOP_F16_F16>;
+defm V_RSQ_F16 : VOP1Inst <vop1<0, 0x3f>, "v_rsq_f16", VOP_F16_F16>;
+defm V_LOG_F16 : VOP1Inst <vop1<0, 0x40>, "v_log_f16", VOP_F16_F16>;
+defm V_EXP_F16 : VOP1Inst <vop1<0, 0x41>, "v_exp_f16", VOP_F16_F16>;
+defm V_FREXP_MANT_F16 : VOP1Inst <vop1<0, 0x42>, "v_frexp_mant_f16",
+  VOP_F16_F16
+>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <vop1<0, 0x43>, "v_frexp_exp_i16_f16",
+  VOP_I16_F16
+>;
+defm V_FLOOR_F16 : VOP1Inst <vop1<0, 0x44>, "v_floor_f16", VOP_F16_F16>;
+defm V_CEIL_F16 : VOP1Inst <vop1<0, 0x45>, "v_ceil_f16", VOP_F16_F16>;
+defm V_TRUNC_F16 : VOP1Inst <vop1<0, 0x46>, "v_trunc_f16", VOP_F16_F16>;
+defm V_RNDNE_F16 : VOP1Inst <vop1<0, 0x47>, "v_rndne_f16", VOP_F16_F16>;
+defm V_FRACT_F16 : VOP1Inst <vop1<0, 0x48>, "v_fract_f16", VOP_F16_F16>;
+defm V_SIN_F16 : VOP1Inst <vop1<0, 0x49>, "v_sin_f16", VOP_F16_F16>;
+defm V_COS_F16 : VOP1Inst <vop1<0, 0x4a>, "v_cos_f16", VOP_F16_F16>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+
+defm V_ADD_F16 : VOP2Inst <vop2<0, 0x1f>, "v_add_f16", VOP_F16_F16_F16>;
+defm V_SUB_F16 : VOP2Inst <vop2<0, 0x20>, "v_sub_f16", VOP_F16_F16_F16>;
+defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16,
+  null_frag, "v_sub_f16"
+>;
+defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>;
+defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>;
+} // End isCommutable = 1
+defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16">;
+let isCommutable = 1 in {
+defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16">;
+defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>;
+defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>;
+defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>;
+defm V_MUL_LO_U16 : VOP2Inst <vop2<0,0x29>, "v_mul_lo_u16", VOP_I16_I16_I16>;
+} // End isCommutable = 1
+defm V_LSHLREV_B16 : VOP2Inst <vop2<0,0x2a>, "v_lshlrev_b16", VOP_I16_I16_I16>;
+defm V_LSHRREV_B16 : VOP2Inst <vop2<0,0x2b>, "v_lshrrev_b16", VOP_I16_I16_I16>;
+defm V_ASHRREV_B16 : VOP2Inst <vop2<0,0x2c>, "v_ashrrev_b16", VOP_I16_I16_I16>;
+let isCommutable = 1 in {
+defm V_MAX_F16 : VOP2Inst <vop2<0,0x2d>, "v_max_f16", VOP_F16_F16_F16>;
+defm V_MIN_F16 : VOP2Inst <vop2<0,0x2e>, "v_min_f16", VOP_F16_F16_F16>;
+defm V_MAX_U16 : VOP2Inst <vop2<0,0x2f>, "v_max_u16", VOP_I16_I16_I16>;
+defm V_MAX_I16 : VOP2Inst <vop2<0,0x30>, "v_max_i16", VOP_I16_I16_I16>;
+defm V_MIN_U16 : VOP2Inst <vop2<0,0x31>, "v_min_u16", VOP_I16_I16_I16>;
+defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
+} // End isCommutable = 1
+defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
+
+// Aliases to simplify matching of floating-point instructions that are VOP2 on
+// SI and VOP3 on VI.
+
+class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+  name#" $dst, $src0, $src1",
+  (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0)
+>, PredicateControl {
+  let UseInstAsmMatchConverter = 0;
+}
+
+def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
+
+} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
 
 //===----------------------------------------------------------------------===//
 // SMEM Patterns
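The SI2_VI3Alias definitions above let the two-source SI syntax match VOP3-encoded VI instructions by zero-filling the extra VOP3 operands. An illustrative model of that expansion, using the operand order from the alias result pattern (dst, src0_modifiers, src0, src1_modifiers, src1, clamp, omod); the function and its flat integer operands are stand-ins, not LLVM code:

// Model of expanding a two-source alias into the full VOP3 operand list.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> expandVOP2StyleAlias(int64_t Dst, int64_t Src0,
                                                 int64_t Src1) {
  // Per-source modifiers, clamp, and omod are all forced to zero, exactly
  // as the alias's result pattern hard-codes them.
  return {Dst, /*src0_modifiers=*/0, Src0, /*src1_modifiers=*/0, Src1,
          /*clamp=*/0, /*omod=*/0};
}

int main() {
  // "v_ldexp_f32 v0, v1, v2" -> VOP3 form with all modifiers cleared.
  auto Ops = expandVOP2StyleAlias(0, 1, 2);
  assert(Ops.size() == 7 && Ops[1] == 0 && Ops[5] == 0 && Ops[6] == 0);
}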