diff options
author | dim <dim@FreeBSD.org> | 2014-03-21 17:53:59 +0000 |
---|---|---|
committer | dim <dim@FreeBSD.org> | 2014-03-21 17:53:59 +0000 |
commit | 9cedb8bb69b89b0f0c529937247a6a80cabdbaec (patch) | |
tree | c978f0e9ec1ab92dc8123783f30b08a7fd1e2a39 /contrib/llvm/lib/Target/R600/SIISelLowering.cpp | |
parent | 03fdc2934eb61c44c049a02b02aa974cfdd8a0eb (diff) | |
download | FreeBSD-src-9cedb8bb69b89b0f0c529937247a6a80cabdbaec.zip FreeBSD-src-9cedb8bb69b89b0f0c529937247a6a80cabdbaec.tar.gz |
MFC 261991:
Upgrade our copy of llvm/clang to 3.4 release. This version supports
all of the features in the current working draft of the upcoming C++
standard, provisionally named C++1y.
The code generator's performance is greatly increased, and the loop
auto-vectorizer is now enabled at -Os and -O2 in addition to -O3. The
PowerPC backend has made several major improvements to code generation
quality and compile time, and the X86, SPARC, ARM32, Aarch64 and SystemZ
backends have all seen major feature work.
Release notes for llvm and clang can be found here:
<http://llvm.org/releases/3.4/docs/ReleaseNotes.html>
<http://llvm.org/releases/3.4/tools/clang/docs/ReleaseNotes.html>
MFC 262121 (by emaste):
Update lldb for clang/llvm 3.4 import
This commit largely restores the lldb source to the upstream r196259
snapshot with the addition of threaded inferior support and a few bug
fixes.
Specific upstream lldb revisions restored include:
SVN git
181387 779e6ac
181703 7bef4e2
182099 b31044e
182650 f2dcf35
182683 0d91b80
183862 15c1774
183929 99447a6
184177 0b2934b
184948 4dc3761
184954 007e7bc
186990 eebd175
Sponsored by: DARPA, AFRL
MFC 262186 (by emaste):
Fix mismerge in r262121
A break statement was lost in the merge. The error had no functional
impact, but restore it to reduce the diff against upstream.
MFC 262303:
Pull in r197521 from upstream clang trunk (by rdivacky):
Use the integrated assembler by default on FreeBSD/ppc and ppc64.
Requested by: jhibbits
MFC 262611:
Pull in r196874 from upstream llvm trunk:
Fix a crash that occurs when PWD is invalid.
MCJIT needs to be able to run in hostile environments, even when PWD
is invalid. There's no need to crash MCJIT in this case.
The obvious fix is to simply leave MCContext's CompilationDir empty
when PWD can't be determined. This way, MCJIT clients,
and other clients that link with LLVM don't need a valid working directory.
If we do want to guarantee valid CompilationDir, that should be done
only for clients of getCompilationDir(). This is as simple as checking
for an empty string.
The only current use of getCompilationDir is EmitGenDwarfInfo, which
won't conceivably run with an invalid working dir. However, in the
purely hypothetical and untestable case that this happens, the
AT_comp_dir will be omitted from the compilation_unit DIE.
This should help fix assertions occurring with ports-mgmt/tinderbox,
when it is using jails, and sometimes invalidates clang's current
working directory.
Reported by: decke
MFC 262809:
Pull in r203007 from upstream clang trunk:
Don't produce an alias between destructors with different calling conventions.
Fixes pr19007.
(Please note that this is an LLVM PR identifier, not a FreeBSD one.)
This should fix Firefox and/or libxul crashes (due to problems with
regparm/stdcall calling conventions) on i386.
Reported by: multiple users on freebsd-current
PR: bin/187103
MFC 263048:
Repair recognition of "CC" as an alias for the C++ compiler, since it
was silently broken by upstream for a Windows-specific use-case.
Apparently some versions of CMake still rely on this archaic feature...
Reported by: rakuco
MFC 263049:
Garbage collect the old way of adding the libstdc++ include directories
in clang's InitHeaderSearch.cpp. This has been superseded by David
Chisnall's commit in r255321.
Moreover, if libc++ is used, the libstdc++ include directories should
not be in the search path at all. These directories are now only used
if you pass -stdlib=libstdc++.
Diffstat (limited to 'contrib/llvm/lib/Target/R600/SIISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/R600/SIISelLowering.cpp | 756 |
1 files changed, 669 insertions, 87 deletions
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp index 6bd82a5..d5d2b68 100644 --- a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp @@ -13,39 +13,36 @@ //===----------------------------------------------------------------------===// #include "SIISelLowering.h" -#include "AMDIL.h" #include "AMDGPU.h" #include "AMDILIntrinsicInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/IR/Function.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Function.h" + +const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; using namespace llvm; SITargetLowering::SITargetLowering(TargetMachine &TM) : - AMDGPUTargetLowering(TM), - TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())), - TRI(TM.getRegisterInfo()) { + AMDGPUTargetLowering(TM) { addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); - - addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); addRegisterClass(MVT::v4i32, 
&AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -59,6 +56,21 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : computeRegisterProperties(); + // Condition Codes + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + + setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); + setCondCodeAction(ISD::SETULE, MVT::f64, Expand); + setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); @@ -66,14 +78,66 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + + setOperationAction(ISD::BITCAST, MVT::i128, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + // We need to custom lower loads/stores from private memory + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, 
Custom); + + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i128, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + + setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::i128, MVT::i64, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i64, Custom); setTargetDAGCombine(ISD::SELECT_CC); @@ -82,12 +146,45 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setSchedulingPreference(Sched::RegPressure); } 
+//===----------------------------------------------------------------------===// +// TargetLowering queries +//===----------------------------------------------------------------------===// + +bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, + bool *IsFast) const { + // XXX: This depends on the address space and also we may want to revist + // the alignment values we specify in the DataLayout. + if (!VT.isSimple() || VT == MVT::Other) + return false; + return VT.bitsGT(MVT::i32); +} + +bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const { + return VT.bitsLE(MVT::i16); +} + +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + SDLoc DL, SDValue Chain, + unsigned Offset) const { + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, DL, + MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(Offset, MVT::i64)); + return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr, + MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, + false, false, MemVT.getSizeInBits() >> 3); + +} + SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc DL, SelectionDAG &DAG, + SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); @@ -103,9 +200,10 @@ SDValue SITargetLowering::LowerFormalArguments( for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; - - // First check if it's a PS input addr - if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) { + + // First check if it's a PS input addr + if (Info->ShaderType == ShaderType::PIXEL && 
!Arg.Flags.isInReg() && + !Arg.Flags.isByVal()) { assert((PSInputNum <= 15) && "Too many PS inputs!"); @@ -120,7 +218,7 @@ SDValue SITargetLowering::LowerFormalArguments( } // Second split vertices into their elements - if (Arg.VT.isVector()) { + if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) { ISD::InputArg NewArg = Arg; NewArg.Flags.setSplit(); NewArg.VT = Arg.VT.getVectorElementType(); @@ -136,7 +234,7 @@ SDValue SITargetLowering::LowerFormalArguments( NewArg.PartOffset += NewArg.VT.getStoreSize(); } - } else { + } else if (Info->ShaderType != ShaderType::COMPUTE) { Splits.push_back(Arg); } } @@ -152,20 +250,44 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } + // The pointer to the list of arguments is stored in SGPR0, SGPR1 + if (Info->ShaderType == ShaderType::COMPUTE) { + CCInfo.AllocateReg(AMDGPU::SGPR0); + CCInfo.AllocateReg(AMDGPU::SGPR1); + MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); + } + + if (Info->ShaderType == ShaderType::COMPUTE) { + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } + AnalyzeFormalArguments(CCInfo, Splits); for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; if (Skipped & (1 << i)) { - InVals.push_back(SDValue()); + InVals.push_back(DAG.getUNDEF(Arg.VT)); continue; } CCValAssign &VA = ArgLocs[ArgIdx++]; + EVT VT = VA.getLocVT(); + + if (VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = Splits[i].VT; + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. 
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), + 36 + VA.getLocMemOffset()); + InVals.push_back(Arg); + continue; + } assert(VA.isRegLoc() && "Parameter must be in a register!"); unsigned Reg = VA.getLocReg(); - MVT VT = VA.getLocVT(); if (VT == MVT::i64) { // For now assume it is a pointer @@ -181,7 +303,6 @@ SDValue SITargetLowering::LowerFormalArguments( Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - const ISD::InputArg &Arg = Ins[i]; if (Arg.VT.isVector()) { // Build a vector from the registers @@ -200,7 +321,7 @@ SDValue SITargetLowering::LowerFormalArguments( NumElements = Arg.VT.getVectorNumElements() - NumElements; for (unsigned j = 0; j != NumElements; ++j) Regs.push_back(DAG.getUNDEF(VT)); - + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs.data(), Regs.size())); continue; @@ -214,36 +335,274 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { + MachineBasicBlock::iterator I = *MI; + switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; + case AMDGPU::SI_ADDR64_RSRC: { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned SuperReg = MI->getOperand(0).getReg(); + unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo) + .addOperand(MI->getOperand(1)); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo) + .addImm(0); + BuildMI(*BB, I, 
MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi) + .addImm(RSRC_DATA_FORMAT >> 32); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi) + .addReg(SubRegHiLo) + .addImm(AMDGPU::sub0) + .addReg(SubRegHiHi) + .addImm(AMDGPU::sub1); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg) + .addReg(SubRegLo) + .addImm(AMDGPU::sub0_sub1) + .addReg(SubRegHi) + .addImm(AMDGPU::sub2_sub3); + MI->eraseFromParent(); + break; + } + case AMDGPU::V_SUB_F64: { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(MI->getOperand(2).getReg()) + .addImm(0) /* src2 */ + .addImm(0) /* ABS */ + .addImm(0) /* CLAMP */ + .addImm(0) /* OMOD */ + .addImm(2); /* NEG */ + MI->eraseFromParent(); + break; + } + case AMDGPU::SI_RegisterStorePseudo: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), + Reg); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) + MIB.addOperand(MI->getOperand(i)); + + MI->eraseFromParent(); + } } return BB; } -EVT SITargetLowering::getSetCCResultType(EVT VT) const { - return MVT::i1; +EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) { + return MVT::i1; + } + return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); } MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { return MVT::i32; } +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + 
return false; /* There is V_MAD_F32 for f32 */ + case MVT::f64: + return true; + default: + break; + } + + return false; +} + //===----------------------------------------------------------------------===// // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::ADD: return LowerADD(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: { + LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); + if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + Op.getValueType().isVector()) { + SDValue MergedValues[2] = { + SplitVectorLoad(Op, DAG), + Load->getChain() + }; + return DAG.getMergeValues(MergedValues, 2, SDLoc(Op)); + } else { + return LowerLOAD(Op, DAG); + } + } + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::ANY_EXTEND: // Fall-through + case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + //XXX: Hardcoded we only use two to store the pointer to the parameters. 
+ unsigned NumUserSGPRs = 2; + switch (IntrinsicID) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20); + case Intrinsic::r600_read_local_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24); + case Intrinsic::r600_read_local_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28); + case Intrinsic::r600_read_local_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32); + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + AMDGPU::VGPR0, VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + AMDGPU::VGPR1, VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + AMDGPU::VGPR2, VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops [] = { + 
ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getSizeInBits() / 8, 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, 2, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(2), + Op.getOperand(3)); + } + } + + case ISD::INTRINSIC_VOID: + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDLoc DL(Op); + SDValue Ops [] = { + Chain, + ResourceDescriptorToi128(Op.getOperand(2), DAG), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getSizeInBits() / 8, 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, + sizeof(Ops)/sizeof(Ops[0]), VT, MMO); + } + default: + break; + } } return SDValue(); } +SDValue SITargetLowering::LowerADD(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i64) + return SDValue(); + + SDLoc 
DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue Zero = DAG.getConstant(0, MVT::i32); + SDValue One = DAG.getConstant(1, MVT::i32); + + SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero); + SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One); + + SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero); + SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One); + + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue); + + SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1); + SDValue Carry = AddLo.getValue(1); + SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry); + + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0)); +} + /// \brief Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { @@ -265,7 +624,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const { - DebugLoc DL = BRCOND.getDebugLoc(); + SDLoc DL(BRCOND); SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); @@ -338,30 +697,51 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } -#define RSRC_DATA_FORMAT 0xf00000000000 - -SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - StoreSDNode *StoreNode = cast<StoreSDNode>(Op); - SDValue Chain = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue VirtualAddress = Op.getOperand(2); - DebugLoc DL = Op.getDebugLoc(); +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); - if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) { + if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); - } - SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, - DAG.getConstant(0, MVT::i64), - 
DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64)); + SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + Load->getBasePtr(), DAG.getConstant(0, MVT::i32)); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr, + DAG.getConstant(2, MVT::i32)); + + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); + SDValue MergedValues[2] = { + Ret, + Load->getChain() + }; + return DAG.getMergeValues(MergedValues, 2, DL); + +} + +SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, + SelectionDAG &DAG) const { - SDValue Ops[2]; - Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain, - Value, SrcSrc, VirtualAddress); - Ops[1] = Chain; + if (Op.getValueType() == MVT::i128) { + return Op; + } - return DAG.getMergeValues(Ops, 2, DL); + assert(Op.getOpcode() == ISD::UNDEF); + return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, + DAG.getConstant(0, MVT::i64), + DAG.getConstant(0, MVT::i64)); +} + +SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, + const SDValue &Op, + SelectionDAG &DAG) const { + return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), + ResourceDescriptorToi128(Op.getOperand(3), DAG), + Op.getOperand(4)); } SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { @@ -371,7 +751,7 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue False = Op.getOperand(3); SDValue CC = Op.getOperand(4); EVT VT = Op.getValueType(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); // Possible Min/Max pattern SDValue MinMax = LowerMinMax(Op, DAG); @@ -383,6 +763,84 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); } +SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); 
+ SDLoc DL(Op); + + if (VT != MVT::i64) { + return SDValue(); + } + + SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0), + DAG.getConstant(31, MVT::i32)); + + return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); +} + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT VT = Store->getMemoryVT(); + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) + return Ret; + + if (VT.isVector() && VT.getVectorNumElements() >= 8) + return SplitVectorStore(Op, DAG); + + if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + return SDValue(); + + SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr, + DAG.getConstant(2, MVT::i32)); + SDValue Chain = Store->getChain(); + SmallVector<SDValue, 8> Values; + + if (VT == MVT::i64) { + for (unsigned i = 0; i < 2; ++i) { + Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + Store->getValue(), DAG.getConstant(i, MVT::i32))); + } + } else if (VT == MVT::i128) { + for (unsigned i = 0; i < 2; ++i) { + for (unsigned j = 0; j < 2; ++j) { + Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, + Store->getValue(), DAG.getConstant(i, MVT::i32)), + DAG.getConstant(j, MVT::i32))); + } + } + } else { + Values.push_back(Store->getValue()); + } + + for (unsigned i = 0; i < Values.size(); ++i) { + SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, + Ptr, DAG.getConstant(i, MVT::i32)); + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Values[i], PartPtr, + DAG.getTargetConstant(0, MVT::i32)); + } + return Chain; +} + + +SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + if (VT != MVT::i64) { + return SDValue(); + } + + return 
DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// @@ -390,13 +848,12 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); switch (N->getOpcode()) { default: break; case ISD::SELECT_CC: { - N->dump(); ConstantSDNode *True, *False; // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) @@ -433,13 +890,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } -/// \brief Test if RegClass is one of the VSrc classes +/// \brief Test if RegClass is one of the VSrc classes static bool isVSrc(unsigned RegClass) { return AMDGPU::VSrc_32RegClassID == RegClass || AMDGPU::VSrc_64RegClassID == RegClass; } -/// \brief Test if RegClass is one of the SSrc classes +/// \brief Test if RegClass is one of the SSrc classes static bool isSSrc(unsigned RegClass) { return AMDGPU::SSrc_32RegClassID == RegClass || AMDGPU::SSrc_64RegClassID == RegClass; @@ -481,6 +938,8 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, bool &ScalarSlotUsed) const { MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) return false; @@ -512,30 +971,67 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, return false; } +const TargetRegisterClass *SITargetLowering::getRegClassForNode( + SelectionDAG &DAG, const SDValue &Op) const { + const SIInstrInfo *TII = + static_cast<const 
SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + if (!Op->isMachineOpcode()) { + switch(Op->getOpcode()) { + case ISD::CopyFromReg: { + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + return MRI.getRegClass(Reg); + } + return TRI.getPhysRegClass(Reg); + } + default: return NULL; + } + } + const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); + int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; + if (OpClassID != -1) { + return TRI.getRegClass(OpClassID); + } + switch(Op.getMachineOpcode()) { + case AMDGPU::COPY_TO_REGCLASS: + // Operand 1 is the register class id for COPY_TO_REGCLASS instructions. + OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue(); + + // If the COPY_TO_REGCLASS instruction is copying to a VSrc register + // class, then the register class for the value could be either a + // VReg or and SReg. In order to get a more accurate + if (OpClassID == AMDGPU::VSrc_32RegClassID || + OpClassID == AMDGPU::VSrc_64RegClassID) { + return getRegClassForNode(DAG, Op.getOperand(0)); + } + return TRI.getRegClass(OpClassID); + case AMDGPU::EXTRACT_SUBREG: { + int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + const TargetRegisterClass *SuperClass = + getRegClassForNode(DAG, Op.getOperand(0)); + return TRI.getSubClassWithSubReg(SuperClass, SubIdx); + } + case AMDGPU::REG_SEQUENCE: + // Operand 0 is the register class id for REG_SEQUENCE instructions. + return TRI.getRegClass( + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()); + default: + return getRegClassFor(Op.getSimpleValueType()); + } +} + /// \brief Does "Op" fit into register class "RegClass" ? 
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op, +bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, unsigned RegClass) const { - - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - SDNode *Node = Op.getNode(); - - const TargetRegisterClass *OpClass; - if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) { - const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode()); - int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; - if (OpClassID == -1) - OpClass = getRegClassFor(Op.getSimpleValueType()); - else - OpClass = TRI->getRegClass(OpClassID); - - } else if (Node->getOpcode() == ISD::CopyFromReg) { - RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode()); - OpClass = MRI.getRegClass(Reg->getReg()); - - } else + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); + if (!RC) { return false; - - return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass); + } + return TRI->getRegClass(RegClass)->hasSubClassEq(RC); } /// \brief Make sure that we don't exeed the number of allowed scalars @@ -561,20 +1057,33 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, return; } - // This is a conservative aproach, it is possible that we can't determine - // the correct register class and copy too often, but better save than sorry. + // This is a conservative aproach. It is possible that we can't determine the + // correct register class and copy too often, but better safe than sorry. 
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); - SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(), + SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Operand.getValueType(), Operand, RC); Operand = SDValue(Node, 0); } +/// \returns true if \p Node's operands are different from the SDValue list +/// \p Ops +static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { + for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) { + if (Ops[i].getNode() != Node->getOperand(i).getNode()) { + return true; + } + } + return false; +} + /// \brief Try to fold the Nodes operands into the Node SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, SelectionDAG &DAG) const { // Original encoding (either e32 or e64) int Opcode = Node->getMachineOpcode(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); const MCInstrDesc *Desc = &TII->get(Opcode); unsigned NumDefs = Desc->getNumDefs(); @@ -700,13 +1209,19 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) Ops.push_back(Node->getOperand(i)); + // Nodes that have a glue result are not CSE'd by getMachineNode(), so in + // this case a brand new node is always be created, even if the operands + // are the same as before. So, manually check if anything has been changed. 
+ if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) { + return Node; + } + // Create a complete new instruction - return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(), - Node->getVTList(), Ops); + return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops); } /// \brief Helper function for adjustWritemask -unsigned SubIdx2Lane(unsigned Idx) { +static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { default: return 0; case AMDGPU::sub0: return 0; @@ -720,7 +1235,9 @@ unsigned SubIdx2Lane(unsigned Idx) { void SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { SDNode *Users[4] = { }; - unsigned Writemask = 0, Lane = 0; + unsigned Lane = 0; + unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned NewDmask = 0; // Try to figure out the used register components for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); @@ -731,32 +1248,45 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return; + // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. + // Note that subregs are packed, i.e. Lane==0 is the first bit set + // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit + // set, etc. Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + // Set which texture component corresponds to the lane. 
+ unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { + assert(Dmask); + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } + // Abort if we have more than one user per component if (Users[Lane]) return; Users[Lane] = *I; - Writemask |= 1 << Lane; + NewDmask |= 1 << Comp; } - // Abort if all components are used - if (Writemask == 0xf) + // Abort if there's no change + if (NewDmask == OldDmask) return; // Adjust the writemask in the node std::vector<SDValue> Ops; - Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32)); + Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) Ops.push_back(Node->getOperand(i)); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); // If we only got one lane, replace it with a copy - if (Writemask == (1U << Lane)) { + // (if NewDmask has only one bit set...) + if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - DebugLoc(), Users[Lane]->getValueType(0), + SDLoc(), Users[Lane]->getValueType(0), SDValue(Node, 0), RC); DAG.ReplaceAllUsesWith(Users[Lane], Copy); return; @@ -784,8 +1314,11 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, /// \brief Fold the instructions after slecting them SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + Node = AdjustRegClass(Node, DAG); - if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1) + if (TII->isMIMG(Node->getMachineOpcode())) adjustWritemask(Node, DAG); return foldOperands(Node, DAG); @@ -795,7 +1328,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const 
{ - if (AMDGPU::isMIMG(MI->getOpcode()) == -1) + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + if (!TII->isMIMG(MI->getOpcode())) return; unsigned VReg = MI->getOperand(0).getReg(); @@ -812,6 +1347,53 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, case 3: RC = &AMDGPU::VReg_96RegClass; break; } + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MRI.setRegClass(VReg, RC); } + +MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, + SelectionDAG &DAG) const { + + SDLoc DL(N); + unsigned NewOpcode = N->getMachineOpcode(); + + switch (N->getMachineOpcode()) { + default: return N; + case AMDGPU::S_LOAD_DWORD_IMM: + NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + // Fall-through + case AMDGPU::S_LOAD_DWORDX2_SGPR: + if (NewOpcode == N->getMachineOpcode()) { + NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + } + // Fall-through + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: { + if (NewOpcode == N->getMachineOpcode()) { + NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + } + if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) { + return N; + } + ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); + SDValue Ops[] = { + SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128, + DAG.getConstant(0, MVT::i64)), 0), + N->getOperand(0), + DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32) + }; + return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); + } + } +} + +SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), + cast<RegisterSDNode>(VReg)->getReg(), VT); +} |