Diffstat (limited to 'contrib/llvm/lib/Target/R600/SIISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/R600/SIISelLowering.cpp | 960
1 file changed, 691 insertions, 269 deletions
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
index d5d2b68..5a148a2 100644
--- a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -12,9 +12,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifdef _MSC_VER
+// Provide M_PI.
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif
+
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
@@ -23,30 +30,27 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/Function.h"
-
-const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+#include "llvm/ADT/SmallString.h"
 
 using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM) :
     AMDGPUTargetLowering(TM) {
-
-  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
 
   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
-  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
 
-  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
 
-  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
-  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);
 
   addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -76,15 +80,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
 
-  setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
   setOperationAction(ISD::ADDE, MVT::i32, Legal);
+  setOperationAction(ISD::SUBC, MVT::i32, Legal);
+  setOperationAction(ISD::SUBE, MVT::i32, Legal);
 
-  setOperationAction(ISD::BITCAST, MVT::i128, Legal);
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
 
   // We need to custom lower vector stores from local memory
-  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
@@ -92,30 +97,40 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
 
-  // We need to custom lower loads/stores from private memory
-  setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::i64, Custom);
-  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i64, Custom);
-  setOperationAction(ISD::STORE, MVT::i128, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
+  setOperationAction(ISD::SELECT, MVT::f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
+  setOperationAction(ISD::SELECT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
 
-  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-
-  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
 
   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
 
-  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
@@ -123,26 +138,101 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
 
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
-  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
 
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 
-  setTargetDAGCombine(ISD::SELECT_CC);
+  // These should use UDIVREM, so set them to expand
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // We only support LOAD/STORE and vector manipulation ops for vectors
+  // with > 4 elements.
+  MVT VecTypes[] = {
+    MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
+  };
+
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT, MVT::i1, Promote);
+
+  for (MVT VT : VecTypes) {
+    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+      switch(Op) {
+      case ISD::LOAD:
+      case ISD::STORE:
+      case ISD::BUILD_VECTOR:
+      case ISD::BITCAST:
+      case ISD::EXTRACT_VECTOR_ELT:
+      case ISD::INSERT_VECTOR_ELT:
+      case ISD::CONCAT_VECTORS:
+      case ISD::INSERT_SUBVECTOR:
+      case ISD::EXTRACT_SUBVECTOR:
+        break;
+      default:
+        setOperationAction(Op, VT, Expand);
+        break;
+      }
+    }
+  }
+
+  for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
+    MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+  }
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::f64, Legal);
+  }
+
+  // FIXME: These should be removed and handled the same was as f32 fneg. Source
+  // modifiers also work for the double instructions.
+  setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  setOperationAction(ISD::FABS, MVT::f64, Expand);
+
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
+
+  setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);
 
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+
   setSchedulingPreference(Sched::RegPressure);
 }
 
@@ -151,21 +241,55 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 //===----------------------------------------------------------------------===//
 
 bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                     unsigned AddrSpace,
                                                      bool *IsFast) const {
+  if (IsFast)
+    *IsFast = false;
+
   // XXX: This depends on the address space and also we may want to revist
   // the alignment values we specify in the DataLayout.
+
+  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+  // which isn't a simple VT.
   if (!VT.isSimple() || VT == MVT::Other)
     return false;
+
+  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
+  // see what for specifically. The wording everywhere else seems to be the
+  // same.
+
+  // XXX - The only mention I see of this in the ISA manual is for LDS direct
+  // reads the "byte address and must be dword aligned". Is it also true for the
+  // normal loads and stores?
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
+    return false;
+
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
+  if (IsFast)
+    *IsFast = true;
   return VT.bitsGT(MVT::i32);
 }
 
-bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
-  return VT.bitsLE(MVT::i16);
+TargetLoweringBase::LegalizeTypeAction
+SITargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                         Type *Ty) const {
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  return TII->isInlineConstant(Imm);
 }
 
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc DL, SDValue Chain,
-                                         unsigned Offset) const {
+                                         unsigned Offset, bool Signed) const {
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                         AMDGPUAS::CONSTANT_ADDRESS);
@@ -173,7 +297,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                            MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
   SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                              DAG.getConstant(Offset, MVT::i64));
-  return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
+  return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
                         MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
                         false, false, MemVT.getSizeInBits() >> 3);
 
@@ -202,7 +326,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     const ISD::InputArg &Arg = Ins[i];
 
     // First check if it's a PS input addr
-    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+    if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
         !Arg.Flags.isByVal()) {
 
       assert((PSInputNum <= 15) && "Too many PS inputs!");
@@ -218,7 +342,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     }
 
     // Second split vertices into their elements
-    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
+    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
       ISD::InputArg NewArg = Arg;
       NewArg.Flags.setSplit();
       NewArg.VT = Arg.VT.getVectorElementType();
@@ -234,7 +358,7 @@ SDValue SITargetLowering::LowerFormalArguments(
         NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
 
-    } else if (Info->ShaderType != ShaderType::COMPUTE) {
+    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
       Splits.push_back(Arg);
     }
   }
@@ -244,20 +368,26 @@ SDValue SITargetLowering::LowerFormalArguments(
                  getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // At least one interpolation mode must be enabled or else the GPU will hang.
-  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
+  if (Info->getShaderType() == ShaderType::PIXEL &&
+      (Info->PSInputAddr & 0x7F) == 0) {
     Info->PSInputAddr |= 1;
     CCInfo.AllocateReg(AMDGPU::VGPR0);
     CCInfo.AllocateReg(AMDGPU::VGPR1);
   }
 
   // The pointer to the list of arguments is stored in SGPR0, SGPR1
-  if (Info->ShaderType == ShaderType::COMPUTE) {
+  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
+    Info->NumUserSGPRs = 4;
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
+    CCInfo.AllocateReg(AMDGPU::SGPR2);
+    CCInfo.AllocateReg(AMDGPU::SGPR3);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
  }
 
-  if (Info->ShaderType == ShaderType::COMPUTE) {
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);
  }
@@ -281,7 +411,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
       SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
-                                   36 + VA.getLocMemOffset());
+                                   36 + VA.getLocMemOffset(),
+                                   Ins[i].Flags.isSExt());
       InVals.push_back(Arg);
       continue;
     }
@@ -322,8 +453,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       for (unsigned j = 0; j != NumElements; ++j)
         Regs.push_back(DAG.getUNDEF(VT));
 
-      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
-                                   Regs.data(), Regs.size()));
+      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
       continue;
     }
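The "first 36 bytes" comment above, together with the LowerParameter offsets used by the r600_read_* intrinsics later in this diff (ngroups at 0/4/8, global size at 12/16/20, local size at 24/28/32), implies a fixed implicit header before the explicit kernel arguments. A hypothetical C++ view of that layout (the struct and field names are illustrative, not from the source):

    #include <cstdint>

    // Illustrative layout of the implicit dispatch header that precedes the
    // kernel arguments in the input buffer; explicit arguments start at
    // byte 36, which is why LowerParameter adds 36 + VA.getLocMemOffset().
    struct DispatchHeader {
      uint32_t NumGroups[3];   // r600_read_ngroups_{x,y,z}: offsets 0, 4, 8
      uint32_t GlobalSize[3];  // r600_read_global_size_{x,y,z}: offsets 12, 16, 20
      uint32_t LocalSize[3];   // r600_read_local_size_{x,y,z}: offsets 24, 28, 32
    };
    static_assert(sizeof(DispatchHeader) == 36, "arguments start at byte 36");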
@@ -336,26 +466,26 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
   MachineBasicBlock::iterator I = *MI;
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
   case AMDGPU::SI_ADDR64_RSRC: {
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
     unsigned SuperReg = MI->getOperand(0).getReg();
-    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
             .addOperand(MI->getOperand(1));
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
             .addImm(0);
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
-            .addImm(RSRC_DATA_FORMAT >> 32);
+            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
             .addReg(SubRegHiLo)
             .addImm(AMDGPU::sub0)
@@ -369,25 +499,52 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
+  case AMDGPU::SI_BUFFER_RSRC: {
+    unsigned SuperReg = MI->getOperand(0).getReg();
+    unsigned Args[4];
+    for (unsigned i = 0, e = 4; i < e; ++i) {
+      MachineOperand &Arg = MI->getOperand(i + 1);
+
+      if (Arg.isReg()) {
+        Args[i] = Arg.getReg();
+        continue;
+      }
+
+      assert(Arg.isImm());
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
+        .addImm(Arg.getImm());
+      Args[i] = Reg;
+    }
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+            SuperReg)
+      .addReg(Args[0])
+      .addImm(AMDGPU::sub0)
+      .addReg(Args[1])
+      .addImm(AMDGPU::sub1)
+      .addReg(Args[2])
+      .addImm(AMDGPU::sub2)
+      .addReg(Args[3])
+      .addImm(AMDGPU::sub3);
+    MI->eraseFromParent();
+    break;
+  }
   case AMDGPU::V_SUB_F64: {
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
-            MI->getOperand(0).getReg())
-            .addReg(MI->getOperand(1).getReg())
-            .addReg(MI->getOperand(2).getReg())
-            .addImm(0)  /* src2 */
-            .addImm(0)  /* ABS */
-            .addImm(0)  /* CLAMP */
-            .addImm(0)  /* OMOD */
-            .addImm(2); /* NEG */
+    unsigned DestReg = MI->getOperand(0).getReg();
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
+      .addImm(0)  // SRC0 modifiers
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(1)  // SRC1 modifiers
+      .addReg(MI->getOperand(2).getReg())
+      .addImm(0)  // SRC2 modifiers
+      .addImm(0)  // src2
+      .addImm(0)  // CLAMP
+      .addImm(0); // OMOD
     MI->eraseFromParent();
     break;
   }
   case AMDGPU::SI_RegisterStorePseudo: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     MachineInstrBuilder MIB =
         BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
@@ -396,6 +553,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
             MIB.addOperand(MI->getOperand(i));
 
     MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FABS_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+      .addImm(0x7fffffff);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
+            MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FNEG_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+      .addImm(0x80000000);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
+            MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FCLAMP_SI: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
+            MI->getOperand(0).getReg())
+      .addImm(0) // SRC0 modifiers
+      .addOperand(MI->getOperand(1))
+      .addImm(0) // SRC1 modifiers
+      .addImm(0) // SRC1
+      .addImm(1) // CLAMP
+      .addImm(0); // OMOD
+    MI->eraseFromParent();
+  }
   }
   return BB;
 }
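The FABS_SI and FNEG_SI expansions above are the usual IEEE-754 bit tricks: clearing the f32 sign bit with an AND mask of 0x7fffffff and flipping it with an XOR of 0x80000000. A minimal scalar sketch of the same operations:

    #include <cstdint>
    #include <cstring>

    // Same bit manipulations that V_AND_B32 / V_XOR_B32 perform on the
    // f32 payload in the expansions above.
    static float fabsBits(float X) {
      uint32_t Bits;
      std::memcpy(&Bits, &X, sizeof(Bits));
      Bits &= 0x7fffffffu;               // clear the sign bit
      std::memcpy(&X, &Bits, sizeof(X));
      return X;
    }

    static float fnegBits(float X) {
      uint32_t Bits;
      std::memcpy(&Bits, &X, sizeof(Bits));
      Bits ^= 0x80000000u;               // flip the sign bit
      std::memcpy(&X, &Bits, sizeof(X));
      return X;
    }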
 
@@ -439,65 +640,57 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::ADD: return LowerADD(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
-    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
-    if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
-        Op.getValueType().isVector()) {
-      SDValue MergedValues[2] = {
-        SplitVectorLoad(Op, DAG),
-        Load->getChain()
-      };
-      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
-    } else {
-      return LowerLOAD(Op, DAG);
-    }
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
   }
 
-  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::FSIN:
+  case ISD::FCOS:
+    return LowerTrig(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::FDIV: return LowerFDIV(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
-  case ISD::ANY_EXTEND: // Fall-through
-  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID =
                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     EVT VT = Op.getValueType();
     SDLoc DL(Op);
-    //XXX: Hardcoded we only use two to store the pointer to the parameters.
-    unsigned NumUserSGPRs = 2;
     switch (IntrinsicID) {
     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     case Intrinsic::r600_read_ngroups_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
     case Intrinsic::r600_read_ngroups_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
     case Intrinsic::r600_read_ngroups_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
     case Intrinsic::r600_read_global_size_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
     case Intrinsic::r600_read_global_size_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
     case Intrinsic::r600_read_global_size_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
     case Intrinsic::r600_read_local_size_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
     case Intrinsic::r600_read_local_size_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
     case Intrinsic::r600_read_local_size_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
     case Intrinsic::r600_read_tgid_x:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+        AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
     case Intrinsic::r600_read_tgid_y:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+        AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
     case Intrinsic::r600_read_tgid_z:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+        AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
     case Intrinsic::r600_read_tidig_x:
       return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                   AMDGPU::VGPR0, VT);
@@ -509,7 +702,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                                   AMDGPU::VGPR2, VT);
     case AMDGPUIntrinsic::SI_load_const: {
       SDValue Ops [] = {
-        ResourceDescriptorToi128(Op.getOperand(1), DAG),
+        Op.getOperand(1),
         Op.getOperand(2)
       };
 
@@ -518,7 +711,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                       MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
                       VT.getSizeInBits() / 8, 4);
       return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                     Op->getVTList(), Ops, 2, VT, MMO);
+                                     Op->getVTList(), Ops, VT, MMO);
     }
     case AMDGPUIntrinsic::SI_sample:
       return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
@@ -530,7 +723,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
       return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
     case AMDGPUIntrinsic::SI_vs_load_input:
       return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
-                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
+                         Op.getOperand(1),
                          Op.getOperand(2),
                          Op.getOperand(3));
   }
@@ -545,7 +738,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
       SDLoc DL(Op);
       SDValue Ops [] = {
         Chain,
-        ResourceDescriptorToi128(Op.getOperand(2), DAG),
+        Op.getOperand(2),
         Op.getOperand(3),
         Op.getOperand(4),
         Op.getOperand(5),
@@ -566,8 +759,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                      MachineMemOperand::MOStore,
                      VT.getSizeInBits() / 8, 4);
       return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
-                                     Op->getVTList(), Ops,
-                                     sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+                                     Op->getVTList(), Ops, VT, MMO);
     }
     default:
       break;
@@ -576,33 +768,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-SDValue SITargetLowering::LowerADD(SDValue Op,
-                                   SelectionDAG &DAG) const {
-  if (Op.getValueType() != MVT::i64)
-    return SDValue();
-
-  SDLoc DL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-
-  SDValue Zero = DAG.getConstant(0, MVT::i32);
-  SDValue One = DAG.getConstant(1, MVT::i32);
-
-  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
-  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
-
-  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
-  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);
-
-  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
-
-  SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
-  SDValue Carry = AddLo.getValue(1);
-  SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);
-
-  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
-}
-
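The LowerADD removed above decomposed a 64-bit add into a 32-bit add producing a carry (ADDC) and a 32-bit add consuming it (ADDE); LowerGlobalAddress later in this diff still builds the same ADDC/ADDE chain. The equivalent scalar arithmetic, as a sketch (the helper name is hypothetical):

    #include <cstdint>

    // 64-bit addition built from two 32-bit adds with an explicit carry,
    // mirroring the ISD::ADDC / ISD::ADDE / BUILD_PAIR sequence.
    static uint64_t add64ViaCarry(uint64_t A, uint64_t B) {
      uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
      uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);

      uint32_t Lo = ALo + BLo;            // ADDC: low halves
      uint32_t Carry = Lo < ALo ? 1 : 0;  // carry out of the low add
      uint32_t Hi = AHi + BHi + Carry;    // ADDE: high halves plus carry

      return (uint64_t(Hi) << 32) | Lo;   // BUILD_PAIR
    }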
 /// \brief Helper function for LowerBRCOND
 static SDNode *findUser(SDValue Value, unsigned Opcode) {
 
@@ -616,7 +781,22 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
     if (I->getOpcode() == Opcode)
       return *I;
   }
-  return 0;
+  return nullptr;
+}
+
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+    TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
+
+  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
 }
 
 /// This transforms the control flow intrinsics to get the branch destination as
@@ -628,7 +808,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   SDNode *Intr = BRCOND.getOperand(1).getNode();
   SDValue Target = BRCOND.getOperand(2);
 
-  SDNode *BR = 0;
+  SDNode *BR = nullptr;
 
   if (Intr->getOpcode() == ISD::SETCC) {
     // As long as we negate the condition everything is fine
@@ -661,7 +841,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   // build the new intrinsic call
   SDNode *Result = DAG.getNode(
     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
-    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
+    DAG.getVTList(Res), Ops).getNode();
 
   if (BR) {
     // Give the branch instruction our target
@@ -669,7 +849,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
       BR->getOperand(0),
       BRCOND.getOperand(2)
     };
-    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
+    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
   }
 
   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -697,42 +877,57 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
-SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
 
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
-    return SDValue();
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 
-  SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-                                 Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
-                            DAG.getConstant(2, MVT::i32));
-
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(), Ptr,
-                            DAG.getTargetConstant(0, MVT::i32),
-                            Op.getOperand(2));
-  SDValue MergedValues[2] = {
-    Ret,
-    Load->getChain()
-  };
-  return DAG.getMergeValues(MergedValues, 2, DL);
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+
+  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+
+  SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(0, MVT::i32));
+  SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(1, MVT::i32));
+  SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrLo, GA);
+  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrHi, DAG.getConstant(0, MVT::i32),
+                           SDValue(Lo.getNode(), 1));
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
 }
 
-SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
-                                                   SelectionDAG &DAG) const {
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
 
-  if (Op.getValueType() == MVT::i128) {
-    return Op;
+  if (Op.getValueType().isVector()) {
+    assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+           "Custom lowering for non-i32 vectors hasn't been implemented.");
+    unsigned NumElements = Op.getValueType().getVectorNumElements();
+    assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+    switch (Load->getAddressSpace()) {
+      default: break;
+      case AMDGPUAS::GLOBAL_ADDRESS:
+      case AMDGPUAS::PRIVATE_ADDRESS:
+        // v4 loads are supported for private and global memory.
+        if (NumElements <= 4)
+          break;
+        // fall-through
+      case AMDGPUAS::LOCAL_ADDRESS:
+        return SplitVectorLoad(Op, DAG);
+    }
   }
 
-  assert(Op.getOpcode() == ISD::UNDEF);
-
-  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
-                     DAG.getConstant(0, MVT::i64),
-                     DAG.getConstant(0, MVT::i64));
+  return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
 }
 
 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
@@ -740,42 +935,129 @@ SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                                                SelectionDAG &DAG) const {
   return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
                      Op.getOperand(2),
-                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
+                     Op.getOperand(3),
                      Op.getOperand(4));
 }
 
-SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Cond = Op.getOperand(0);
+
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
+
+  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
+
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
+
+  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
+
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+}
+
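LowerSELECT above implements an i64 select with two i32 selects on the bitcast halves, since the hardware selects operate on 32-bit values. In plain terms the decomposition is (a sketch, not the DAG form):

    #include <cstdint>

    // Select a 64-bit value using only 32-bit selects, as LowerSELECT does
    // with EXTRACT_VECTOR_ELT, two getSelect calls, BUILD_VECTOR and BITCAST.
    static uint64_t select64(bool Cond, uint64_t T, uint64_t F) {
      uint32_t Lo = Cond ? uint32_t(T) : uint32_t(F);              // low-half select
      uint32_t Hi = Cond ? uint32_t(T >> 32) : uint32_t(F >> 32);  // high-half select
      return (uint64_t(Hi) << 32) | Lo;
    }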
+// Catch division cases where we can use shortcuts with rcp and rsq
+// instructions.
+SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
-  SDValue True = Op.getOperand(2);
-  SDValue False = Op.getOperand(3);
-  SDValue CC = Op.getOperand(4);
   EVT VT = Op.getValueType();
-  SDLoc DL(Op);
+  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+
+  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
+        CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation has a worst case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+
+      // 1.0 / sqrt(x) -> rsq(x)
+      //
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    }
+  }
 
-  // Possible Min/Max pattern
-  SDValue MinMax = LowerMinMax(Op, DAG);
-  if (MinMax.getNode()) {
-    return MinMax;
+  if (Unsafe) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
   }
 
-  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
-  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
+  return SDValue();
 }
 
-SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  SDValue FastLowered = LowerFastFDIV(Op, DAG);
+  if (FastLowered.getNode())
+    return FastLowered;
 
-  if (VT != MVT::i64) {
+  // This uses v_rcp_f32 which does not handle denormals. Let this hit a
+  // selection error for now rather than do something incorrect.
+  if (Subtarget->hasFP32Denormals())
     return SDValue();
-  }
 
-  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
-                           DAG.getConstant(31, MVT::i32));
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
 
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
+  const APFloat K0Val(BitsToFloat(0x6f800000));
+  const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);
+
+  const APFloat K1Val(BitsToFloat(0x2f800000));
+  const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
+
+  const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
+
+SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFDIV32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFDIV64(Op, DAG);
+
+  llvm_unreachable("Unexpected type for fdiv");
 }
 
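The constants in LowerFDIV32 are 2^96 (bits 0x6f800000) and 2^-32 (bits 0x2f800000): when |rhs| is so large that its reciprocal could be a denormal, the divisor is pre-scaled by 2^-32 and the quotient rescaled by the same factor, keeping v_rcp_f32 out of the denormal range. A sketch of the arithmetic, with an ordinary division standing in for v_rcp_f32:

    #include <cmath>

    // Emulates the LowerFDIV32 sequence: Scale = |RHS| > 2^96 ? 2^-32 : 1.0;
    // result = Scale * (LHS * rcp(RHS * Scale)), which algebraically is LHS/RHS.
    static float fdiv32Scaled(float LHS, float RHS) {
      const float K0 = 0x1p96f;   // bits 0x6f800000, scale threshold
      const float K1 = 0x1p-32f;  // bits 0x2f800000, scale factor

      float Scale = std::fabs(RHS) > K0 ? K1 : 1.0f;
      float Rcp = 1.0f / (RHS * Scale);   // stand-in for v_rcp_f32
      return Scale * (LHS * Rcp);
    }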
 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -783,6 +1065,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   StoreSDNode *Store = cast<StoreSDNode>(Op);
   EVT VT = Store->getMemoryVT();
 
+  // These stores are legal.
+  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      VT.isVector() && VT.getVectorNumElements() == 2 &&
+      VT.getVectorElementType() == MVT::i32)
+    return SDValue();
+
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (VT.isVector() && VT.getVectorNumElements() > 4)
+      return SplitVectorStore(Op, DAG);
+    return SDValue();
+  }
+
   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   if (Ret.getNode())
     return Ret;
@@ -790,61 +1084,125 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   if (VT.isVector() && VT.getVectorNumElements() >= 8)
       return SplitVectorStore(Op, DAG);
 
-  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+  if (VT == MVT::i1)
+    return DAG.getTruncStore(Store->getChain(), DL,
+                        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+                        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
+        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
+          DAG.getConstantFP(0.5 / M_PI, VT)));
+
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+  case ISD::FSIN:
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+}
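LowerTrig multiplies the argument by 0.5/pi and takes the fractional part before handing it to SIN_HW/COS_HW. A scalar model of that pre-scaling, under the assumption (suggested by the scaling, not stated in this diff) that the hardware ops take their operand in fractions of a full period:

    #include <cmath>

    // Scalar model of the LowerTrig sequence, assuming sinHW(x) == sin(2*pi*x).
    static float sinViaFract(float Arg) {
      float Scaled = Arg * float(0.5 / M_PI);       // radians -> revolutions
      float Fract = Scaled - std::floor(Scaled);    // AMDGPUISD::FRACT
      return std::sin(Fract * 2.0f * float(M_PI));  // stand-in for SIN_HW
    }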
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  EVT ScalarVT = VT.getScalarType();
+  if (ScalarVT != MVT::f32)
     return SDValue();
 
-  SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
-                            DAG.getConstant(2, MVT::i32));
-  SDValue Chain = Store->getChain();
-  SmallVector<SDValue, 8> Values;
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
 
-  if (VT == MVT::i64) {
-    for (unsigned i = 0; i < 2; ++i) {
-      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
-    }
-  } else if (VT == MVT::i128) {
-    for (unsigned i = 0; i < 2; ++i) {
-      for (unsigned j = 0; j < 2; ++j) {
-        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
-                         DAG.getConstant(j, MVT::i32)));
-      }
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+      DCI.AddToWorklist(Cvt.getNode());
+      return Cvt;
     }
-  } else {
-    Values.push_back(Store->getValue());
   }
 
-  for (unsigned i = 0; i < Values.size(); ++i) {
-    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
-                                  Ptr, DAG.getConstant(i, MVT::i32));
-    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                        Chain, Values[i], PartPtr,
-                        DAG.getTargetConstant(0, MVT::i32));
+  // We are primarily trying to catch operations on illegal vector types
+  // before they are expanded.
+  // For scalars, we can use the more flexible method of checking masked bits
+  // after legalization.
+  if (!DCI.isBeforeLegalize() ||
+      !SrcVT.isVector() ||
+      SrcVT.getVectorElementType() != MVT::i8) {
+    return SDValue();
   }
 
-  return Chain;
-}
-
-SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
+  assert(DCI.isBeforeLegalize() && "Unexpected legal type");
 
-  if (VT != MVT::i64) {
+  // Weird sized vectors are a pain to handle, but we know 3 is really the same
+  // size as 4.
+  unsigned NElts = SrcVT.getVectorNumElements();
+  if (!SrcVT.isSimple() && NElts != 3)
     return SDValue();
+
+  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+  // prevent a mess from expanding to v4i32 and repacking.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+
+    LoadSDNode *Load = cast<LoadSDNode>(Src);
+    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+                                     Load->getChain(),
+                                     Load->getBasePtr(),
+                                     LoadVT,
+                                     Load->getMemOperand());
+
+    // Make sure successors of the original load stay after it by updating
+    // them to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+    SmallVector<SDValue, 4> Elts;
+    if (RegVT.isVector())
+      DAG.ExtractVectorElements(NewLoad, Elts);
+    else
+      Elts.push_back(NewLoad);
+
+    SmallVector<SDValue, 4> Ops;
+
+    unsigned EltIdx = 0;
+    for (SDValue Elt : Elts) {
+      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+      for (unsigned I = 0; I < ComponentsInElt; ++I) {
+        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+        DCI.AddToWorklist(Cvt.getNode());
+        Ops.push_back(Cvt);
+      }
+
+      ++EltIdx;
+    }
+
+    assert(Ops.size() == NElts);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
   }
 
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
-                     DAG.getConstant(0, MVT::i32));
+  return SDValue();
 }
 
-//===----------------------------------------------------------------------===//
-// Custom DAG optimizations
-//===----------------------------------------------------------------------===//
-
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -852,26 +1210,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   EVT VT = N->getValueType(0);
 
   switch (N->getOpcode()) {
-    default: break;
-    case ISD::SELECT_CC: {
-      ConstantSDNode *True, *False;
-      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
-      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
-          && True->isAllOnesValue()
-          && False->isNullValue()
-          && VT == MVT::i1) {
-        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
-                           N->getOperand(1), N->getOperand(4));
-
-      }
-      break;
-    }
+    default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
     case ISD::SETCC: {
       SDValue Arg0 = N->getOperand(0);
       SDValue Arg1 = N->getOperand(1);
       SDValue CC = N->getOperand(2);
-      ConstantSDNode * C = NULL;
+      ConstantSDNode * C = nullptr;
       ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
 
       // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
@@ -886,8 +1230,34 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
       }
       break;
     }
+
+    case AMDGPUISD::CVT_F32_UBYTE0:
+    case AMDGPUISD::CVT_F32_UBYTE1:
+    case AMDGPUISD::CVT_F32_UBYTE2:
+    case AMDGPUISD::CVT_F32_UBYTE3: {
+      unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+      SDValue Src = N->getOperand(0);
+      APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+      APInt KnownZero, KnownOne;
+      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                            !DCI.isBeforeLegalizeOps());
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+          TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+        DCI.CommitTargetLoweringOpt(TLO);
+      }
+
+      break;
+    }
+
+    case ISD::UINT_TO_FP: {
+      return performUCharToFloatCombine(N, DCI);
+    }
   }
-  return SDValue();
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
 
 /// \brief Test if RegClass is one of the VSrc classes
@@ -918,9 +1288,11 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
       return -1;
     }
     Imm.I = Node->getSExtValue();
-  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
+  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+    if (N->getValueType(0) != MVT::f32)
+      return -1;
     Imm.F = Node->getValueAPF().convertToFloat();
-  else
+  } else
     return -1; // It isn't an immediate
 
   if ((Imm.I >= -16 && Imm.I <= 64) ||
@@ -940,7 +1312,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
   MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
+  if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
     return false;
 
   const SDValue &Op = Mov->getOperand(0);
@@ -987,7 +1359,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode(
       }
       return TRI.getPhysRegClass(Reg);
     }
-    default: return NULL;
+    default: return nullptr;
   }
 }
 const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
@@ -1047,7 +1419,7 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
   else
     return;
 
-  // Nothing todo if they fit naturaly
+  // Nothing to do if they fit naturally
   if (fitsRegClass(DAG, Operand, RegClass))
     return;
 
@@ -1059,9 +1431,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
 
   // This is a conservative aproach. It is possible that we can't determine the
   // correct register class and copy too often, but better safe than sorry.
-  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
-  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
-                                    Operand.getValueType(), Operand, RC);
+
+  SDNode *Node;
+  // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
+  if (isa<FrameIndexSDNode>(Operand)) {
+    unsigned Opcode = Operand.getValueType() == MVT::i32 ?
+                      AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
+                              Operand);
+  } else {
+    SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+    Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+                              Operand.getValueType(), Operand, RC);
+  }
   Operand = SDValue(Node, 0);
 }
 
@@ -1091,22 +1473,22 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
 
   // Commuted opcode if available
   int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
-  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);
+  const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
 
   assert(!DescRev || DescRev->getNumDefs() == NumDefs);
   assert(!DescRev || DescRev->getNumOperands() == NumOps);
 
   // e64 version if available, -1 otherwise
   int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
-  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);
+  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
+  int InputModifiers[3] = {0};
 
   assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
-  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));
 
   int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
   bool HaveVSrc = false, HaveSSrc = false;
 
-  // First figure out what we alread have in this instruction
+  // First figure out what we already have in this instruction.
   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
        i != e && Op < NumOps; ++i, ++Op) {
 
@@ -1125,7 +1507,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
     }
   }
 
-  // If we neither have VSrc nor SSrc it makes no sense to continue
+  // If we neither have VSrc nor SSrc, it makes no sense to continue.
   if (!HaveVSrc && !HaveSSrc)
     return Node;
 
@@ -1141,20 +1523,28 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
     const SDValue &Operand = Node->getOperand(i);
     Ops.push_back(Operand);
 
-    // Already folded immediate ?
+    // Already folded immediate?
     if (isa<ConstantSDNode>(Operand.getNode()) ||
         isa<ConstantFPSDNode>(Operand.getNode()))
       continue;
 
-    // Is this a VSrc or SSrc operand ?
+    // Is this a VSrc or SSrc operand?
     unsigned RegClass = Desc->OpInfo[Op].RegClass;
     if (isVSrc(RegClass) || isSSrc(RegClass)) {
       // Try to fold the immediates
       if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
-        // Folding didn't worked, make sure we don't hit the SReg limit
+        // Folding didn't work, make sure we don't hit the SReg limit.
         ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
       }
       continue;
+    } else {
+      // If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
+      // These will be lowered to immediates, so we will need to insert a MOV.
+      if (isa<GlobalAddressSDNode>(Ops[i])) {
+        SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
+                                          Operand.getValueType(), Operand);
+        Ops[i] = SDValue(Node, 0);
+      }
     }
 
     if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
@@ -1168,18 +1558,18 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
 
         // Swap commutable operands
-        SDValue Tmp = Ops[1];
-        Ops[1] = Ops[0];
-        Ops[0] = Tmp;
+        std::swap(Ops[0], Ops[1]);
 
         Desc = DescRev;
-        DescRev = 0;
+        DescRev = nullptr;
         continue;
       }
     }
 
-    if (DescE64 && !Immediate) {
+    if (Immediate)
+      continue;
+
+    if (DescE64) {
       // Test if it makes sense to switch to e64 encoding
       unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
       if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
@@ -1194,14 +1584,46 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
         Immediate = -1;
         Promote2e64 = true;
         Desc = DescE64;
-        DescE64 = 0;
+        DescE64 = nullptr;
       }
     }
+
+    if (!DescE64 && !Promote2e64)
+      continue;
+
+    if (!Operand.isMachineOpcode())
+      continue;
+
+    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 1;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
+    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 2;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
   }
 
   if (Promote2e64) {
+    std::vector<SDValue> OldOps(Ops);
+    Ops.clear();
+    for (unsigned i = 0; i < OldOps.size(); ++i) {
+      // src_modifier
+      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
+      Ops.push_back(OldOps[i]);
+    }
     // Add the modifier flags while promoting
-    for (unsigned i = 0; i < 4; ++i)
+    for (unsigned i = 0; i < 2; ++i)
       Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
   }
 
@@ -1279,7 +1701,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
     Ops.push_back(Node->getOperand(i));
-  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
+  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
 
   // If we only got one lane, replace it with a copy
   // (if NewDmask has only one bit set...)
@@ -1311,7 +1733,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   }
 }
 
-/// \brief Fold the instructions after slecting them
+/// \brief Fold the instructions after selecting them.
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
   const SIInstrInfo *TII =