Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1078
1 file changed, 763 insertions, 315 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 352423ed..54caa2c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -37,7 +37,7 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, MachineFunction &MF = State.getMachineFunction(); AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); - uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(), + uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), ArgFlags.getOrigAlign()); State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); return true; @@ -55,14 +55,6 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, StoreSize); - - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -180,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); @@ -287,6 +269,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { @@ -367,6 +350,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::OR, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); @@ -440,22 +425,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v4f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); + // There are no libcalls of any kind. + for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) + setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. 
For now, + // we don't have a way of knowing during instruction selection if a condition + // will be uniform and we always use vector compares. Assume we are using + // vector compares until that is fixed. + setHasMultipleConditionRegisters(true); + // SI at least has hardware support for floating point exceptions, but no way // of using or handling them is implemented. They are also optional in OpenCL // (Section 7.3) setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); - setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - setFsqrtIsCheap(true); - // We want to find all load dependencies for long chains of stores to enable // merging into very wide vectors. The problem is with vectors with > 4 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 @@ -472,22 +466,42 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MaxStoresPerMemset = 4096; setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::MULHU); + setTargetDAGCombine(ISD::MULHS); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +static bool fnegFoldsIntoOp(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FMA: + case ISD::FMAD: + case ISD::FSIN: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMUL_LEGACY: + return true; + default: + return false; + } +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -500,7 +514,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { // FIXME: Why are we reporting vectors of FP immediates as legal? bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || + (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); } // We don't want to shrink f64 / f32 constants. @@ -565,12 +580,12 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; + return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && + VT == MVT::f16); } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; + return isFAbsFree(VT); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, @@ -593,19 +608,32 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. 
- return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); + + unsigned SrcSize = Source.getSizeInBits(); + unsigned DestSize = Dest.getSizeInBits(); + + return DestSize < SrcSize && DestSize % 32 == 0 ; } bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { // Truncate is just accessing a subregister. - return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && - (Dest->getPrimitiveSizeInBits() % 32 == 0); + + unsigned SrcSize = Source->getScalarSizeInBits(); + unsigned DestSize = Dest->getScalarSizeInBits(); + + if (DestSize== 16 && Subtarget->has16BitInsts()) + return SrcSize >= 32; + + return DestSize < SrcSize && DestSize % 32 == 0; } bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { unsigned SrcSize = Src->getScalarSizeInBits(); unsigned DestSize = Dest->getScalarSizeInBits(); + if (SrcSize == 16 && Subtarget->has16BitInsts()) + return DestSize >= 32; + return SrcSize == 32 && DestSize == 64; } @@ -614,6 +642,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { // practical purposes, the extra mov 0 to load a 64-bit is free. As used, // this will enable reducing 64-bit operations the 32-bit, which is always // good. + + if (Src == MVT::i16) + return Dest == MVT::i32 ||Dest == MVT::i64 ; + return Src == MVT::i32 && Dest == MVT::i64; } @@ -635,9 +667,105 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // TargetLowering Callbacks //===---------------------------------------------------------------------===// -void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, +/// The SelectionDAGBuilder will automatically promote function arguments +/// with illegal types. However, this does not work for the AMDGPU targets +/// since the function arguments are stored in memory as these illegal types. +/// In order to handle this properly we need to get the original types sizes +/// from the LLVM IR Function and fixup the ISD:InputArg values before +/// passing them to AnalyzeFormalArguments() + +/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting +/// input values across multiple registers. Each item in the Ins array +/// represents a single value that will be stored in regsters. Ins[x].VT is +/// the value type of the value that will be stored in the register, so +/// whatever SDNode we lower the argument to needs to be this type. +/// +/// In order to correctly lower the arguments we need to know the size of each +/// argument. Since Ins[x].VT gives us the size of the register that will +/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type +/// for the orignal function argument so that we can deduce the correct memory +/// type to use for Ins[x]. In most cases the correct memory type will be +/// Ins[x].ArgVT. However, this will not always be the case. If, for example, +/// we have a kernel argument of type v8i8, this argument will be split into +/// 8 parts and each part will be represented by its own item in the Ins array. +/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of +/// the argument before it was split. From this, we deduce that the memory type +/// for each individual part is i8. We pass the memory type as LocVT to the +/// calling convention analysis function and the register type (Ins[x].VT) as +/// the ValVT. 
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const { + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + const ISD::InputArg &In = Ins[i]; + EVT MemVT; + + unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); + + if (!Subtarget->isAmdHsaOS() && + (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) { + // The ABI says the caller will extend these values to 32-bits. + MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32; + } else if (NumRegs == 1) { + // This argument is not split, so the IR type is the memory type. + assert(!In.Flags.isSplit()); + if (In.ArgVT.isExtended()) { + // We have an extended type, like i24, so we should just use the register type + MemVT = In.VT; + } else { + MemVT = In.ArgVT; + } + } else if (In.ArgVT.isVector() && In.VT.isVector() && + In.ArgVT.getScalarType() == In.VT.getScalarType()) { + assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements()); + // We have a vector value which has been split into a vector with + // the same scalar type, but fewer elements. This should handle + // all the floating-point vector types. + MemVT = In.VT; + } else if (In.ArgVT.isVector() && + In.ArgVT.getVectorNumElements() == NumRegs) { + // This arg has been split so that each element is stored in a separate + // register. + MemVT = In.ArgVT.getScalarType(); + } else if (In.ArgVT.isExtended()) { + // We have an extended type, like i65. + MemVT = In.VT; + } else { + unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; + assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0); + if (In.VT.isInteger()) { + MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); + } else if (In.VT.isVector()) { + assert(!In.VT.getScalarType().isFloatingPoint()); + unsigned NumElements = In.VT.getVectorNumElements(); + assert(MemoryBits % NumElements == 0); + // This vector type has been split into another vector type with + // a different elements size. + EVT ScalarVT = EVT::getIntegerVT(State.getContext(), + MemoryBits / NumElements); + MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); + } else { + llvm_unreachable("cannot deduce memory type."); + } + } + // Convert one element vectors to scalar. 
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) + MemVT = MemVT.getScalarType(); + + if (MemVT.isExtended()) { + // This should really only happen if we have vec3 arguments + assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + MemVT = MemVT.getPow2VectorType(State.getContext()); + } + + assert(MemVT.isSimple()); + allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, + State); + } +} + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } @@ -678,8 +806,10 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); DAG.getContext()->diagnose(NoCalls); - for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) - InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + if (!CLI.IsTailCall) { + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + } return DAG.getEntryNode(); } @@ -718,6 +848,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::CTLZ: @@ -745,94 +876,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, } } -// FIXME: This implements accesses to initialized globals in the constant -// address space by copying them to private and accessing that. It does not -// properly handle illegal types or vectors. The private vector loads are not -// scalarized, and the illegal scalars hit an assertion. This technique will not -// work well with large initializers, and this should eventually be -// removed. Initialized globals should be placed into a data section that the -// runtime will load into a buffer before the kernel is executed. Uses of the -// global need to be replaced with a pointer loaded from an implicit kernel -// argument into this buffer holding the copy of the data, which will remove the -// need for any of this. 
-SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, - const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const { - const DataLayout &TD = DAG.getDataLayout(); - SDLoc DL(InitPtr); - Type *InitTy = Init->getType(); - - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), - TD.getPrefTypeAlignment(InitTy)); - } - - if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { - EVT VT = EVT::getEVT(CFP->getType()); - PointerType *PtrTy = PointerType::get(CFP->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), - TD.getPrefTypeAlignment(CFP->getType())); - } - - if (StructType *ST = dyn_cast<StructType>(InitTy)) { - const StructLayout *SL = TD.getStructLayout(ST); - - EVT PtrVT = InitPtr.getValueType(); - SmallVector<SDValue, 8> Chains; - - for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { - SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(I); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { - EVT PtrVT = InitPtr.getValueType(); - - unsigned NumElements; - if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) - NumElements = AT->getNumElements(); - else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) - NumElements = VT->getNumElements(); - else - llvm_unreachable("Unexpected type"); - - unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType()); - SmallVector<SDValue, 8> Chains; - for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(i); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (isa<UndefValue>(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), - TD.getPrefTypeAlignment(InitTy)); - } - - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); -} - static bool hasDefinedInitializer(const GlobalValue *GV) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); if (!GVar || !GVar->hasInitializer()) @@ -850,11 +893,6 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, const GlobalValue *GV = G->getGlobal(); switch (G->getAddressSpace()) { - case AMDGPUAS::CONSTANT_ADDRESS: { - MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT); - return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA); - } case AMDGPUAS::LOCAL_ADDRESS: { // XXX: What does the value of G->getOffset() mean? 
assert(G->getOffset() == 0 && @@ -864,24 +902,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, if (hasDefinedInitializer(GV)) break; - unsigned Offset; - if (MFI->LocalMemoryObjects.count(GV) == 0) { - unsigned Align = GV->getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV->getValueType()); - - /// TODO: We should sort these to minimize wasted space due to alignment - /// padding. Currently the padding is decided by the first encountered use - /// during lowering. - Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align); - MFI->LocalMemoryObjects[GV] = Offset; - MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType()); - } else { - Offset = MFI->LocalMemoryObjects[GV]; - } - - return DAG.getConstant(Offset, SDLoc(Op), - getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS)); + unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); + return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); } } @@ -1097,65 +1119,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, return DAG.getMergeValues(Ops, SL); } -// FIXME: This isn't doing anything for SI. This should be used in a target -// combine during type legalization. -SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast<StoreSDNode>(Op); - EVT MemVT = Store->getMemoryVT(); - unsigned MemBits = MemVT.getSizeInBits(); - - // Byte stores are really expensive, so if possible, try to pack 32-bit vector - // truncating store into an i32 store. - // XXX: We could also handle optimize other vector bitwidths. - if (!MemVT.isVector() || MemBits > 32) { - return SDValue(); - } - - SDLoc DL(Op); - SDValue Value = Store->getValue(); - EVT VT = Value.getValueType(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Ptr = Store->getBasePtr(); - EVT MemEltVT = MemVT.getVectorElementType(); - unsigned MemEltBits = MemEltVT.getSizeInBits(); - unsigned MemNumElements = MemVT.getVectorNumElements(); - unsigned PackedSize = MemVT.getStoreSizeInBits(); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); - - assert(Value.getValueType().getScalarSizeInBits() >= 32); - - SDValue PackedValue; - for (unsigned i = 0; i < MemNumElements; ++i) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, - DAG.getConstant(i, DL, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); - Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg - - SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); - Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); - - if (i == 0) { - PackedValue = Elt; - } else { - PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); - } - } - - if (PackedSize < 32) { - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); - return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), PackedVT, - Store->getAlignment(), - Store->getMemOperand()->getFlags()); - } - - return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - Store->getAlignment(), - Store->getMemOperand()->getFlags()); -} - SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1670,7 +1633,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f64); - APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + APFloat C1Val(APFloat::IEEEdouble(), 
"0x1.0p+52"); SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); @@ -1681,7 +1644,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); - APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); EVT SetCCVT = @@ -1988,14 +1951,26 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, assert(Op.getOperand(0).getValueType() == MVT::i64 && "operation should be legal"); + // TODO: Factor out code common with LowerSINT_TO_FP. + EVT DestVT = Op.getValueType(); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, false); + if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); + SDValue FPRound = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + + return FPRound; + } if (DestVT == MVT::f32) return LowerINT_TO_FP32(Op, DAG, false); - return SDValue(); + assert(DestVT == MVT::f64); + return LowerINT_TO_FP64(Op, DAG, false); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -2003,14 +1978,26 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, assert(Op.getOperand(0).getValueType() == MVT::i64 && "operation should be legal"); + // TODO: Factor out code common with LowerUINT_TO_FP. + EVT DestVT = Op.getValueType(); + if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); + SDValue FPRound = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + + return FPRound; + } + if (DestVT == MVT::f32) return LowerINT_TO_FP32(Op, DAG, true); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, true); - - return SDValue(); + assert(DestVT == MVT::f64); + return LowerINT_TO_FP64(Op, DAG, true); } SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, @@ -2042,10 +2029,118 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); } +SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { + + if (getTargetMachine().Options.UnsafeFPMath) { + // There is a generic expand for FP_TO_FP16 with unsafe fast math. + return SDValue(); + } + + SDLoc DL(Op); + SDValue N0 = Op.getOperand(0); + assert (N0.getSimpleValueType() == MVT::f64); + + // f64 -> f16 conversion using round-to-nearest-even rounding mode. 
+ const unsigned ExpMask = 0x7ff; + const unsigned ExpBiasf64 = 1023; + const unsigned ExpBiasf16 = 15; + SDValue Zero = DAG.getConstant(0, DL, MVT::i32); + SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); + SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, + DAG.getConstant(32, DL, MVT::i64)); + UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); + U = DAG.getZExtOrTrunc(U, DL, MVT::i32); + SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, + DAG.getConstant(20, DL, MVT::i64)); + E = DAG.getNode(ISD::AND, DL, MVT::i32, E, + DAG.getConstant(ExpMask, DL, MVT::i32)); + // Subtract the fp64 exponent bias (1023) to get the real exponent and + // add the f16 bias (15) to get the biased exponent for the f16 format. + E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, + DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); + + SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, + DAG.getConstant(8, DL, MVT::i32)); + M = DAG.getNode(ISD::AND, DL, MVT::i32, M, + DAG.getConstant(0xffe, DL, MVT::i32)); + + SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, + DAG.getConstant(0x1ff, DL, MVT::i32)); + MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); + + SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); + M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); + + // (M != 0 ? 0x0200 : 0) | 0x7c00; + SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, + DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), + Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); + + // N = M | (E << 12); + SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, + DAG.getNode(ISD::SHL, DL, MVT::i32, E, + DAG.getConstant(12, DL, MVT::i32))); + + // B = clamp(1-E, 0, 13); + SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, + One, E); + SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); + B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, + DAG.getConstant(13, DL, MVT::i32)); + + SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, + DAG.getConstant(0x1000, DL, MVT::i32)); + + SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); + SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); + SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); + D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); + + SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); + SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, + DAG.getConstant(0x7, DL, MVT::i32)); + V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, + DAG.getConstant(2, DL, MVT::i32)); + SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), + One, Zero, ISD::SETEQ); + SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), + One, Zero, ISD::SETGT); + V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); + V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); + + V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), + DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); + V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), + I, V, ISD::SETEQ); + + // Extract the sign bit. 
+ SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, + DAG.getConstant(16, DL, MVT::i32)); + Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, + DAG.getConstant(0x8000, DL, MVT::i32)); + + V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); + return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); +} + SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + // TODO: Factor out code common with LowerFP_TO_UINT. + + EVT SrcVT = Src.getValueType(); + if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { + SDLoc DL(Op); + + SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + SDValue FpToInt32 = + DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); + + return FpToInt32; + } + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) return LowerFP64_TO_INT(Op, DAG, true); @@ -2056,6 +2151,19 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + // TODO: Factor out code common with LowerFP_TO_SINT. + + EVT SrcVT = Src.getValueType(); + if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { + SDLoc DL(Op); + + SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + SDValue FpToInt32 = + DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); + + return FpToInt32; + } + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) return LowerFP64_TO_INT(Op, DAG, false); @@ -2068,8 +2176,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); - if (!VT.isVector()) - return SDValue(); + assert(VT.isVector()); SDValue Src = Op.getOperand(0); SDLoc DL(Op); @@ -2108,17 +2215,20 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; } -static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { +static bool simplifyI24(SDNode *Node24, unsigned OpIdx, + TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op = Node24->getOperand(OpIdx); EVT VT = Op.getValueType(); APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); + if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI)) + return true; + + return false; } template <typename IntTy> @@ -2188,6 +2298,9 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // problems during legalization, the emitted instructions to pack and unpack // the bytes again are not eliminated in the case of an unaligned copy. if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (VT.isVector()) + return scalarizeVectorLoad(LN, DAG); + SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); return DAG.getMergeValues(Ops, SDLoc(N)); @@ -2236,8 +2349,12 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, // order problems during legalization, the emitted instructions to pack and // unpack the bytes again are not eliminated in the case of an unaligned // copy. 
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) + if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (VT.isVector()) + return scalarizeVectorStore(SN, DAG); + return expandUnalignedStore(SN, DAG); + } if (!IsFast) return SDValue(); @@ -2262,38 +2379,21 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } -// TODO: Should repeat for other bit ops. -SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (N->getValueType(0) != MVT::i64) - return SDValue(); - - // Break up 64-bit and of a constant into two 32-bit ands. This will typically - // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer - // combine opportunities since most 64-bit operations are decomposed this way. - // TODO: We won't want this for SALU especially if it is an inline immediate. - const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!RHS) - return SDValue(); - - uint64_t Val = RHS->getZExtValue(); - if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) { - // If either half of the constant is 0, this is really a 32-bit and, so - // split it. If we can re-use the full materialized constant, keep it. - return SDValue(); - } - - SDLoc SL(N); +/// Split the 64-bit value \p LHS into two 32-bit components, and perform the +/// binary operation \p Opc to it with the corresponding constant operands. +SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( + DAGCombinerInfo &DCI, const SDLoc &SL, + unsigned Opc, SDValue LHS, + uint32_t ValLo, uint32_t ValHi) const { SelectionDAG &DAG = DCI.DAG; - SDValue Lo, Hi; - std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG); + std::tie(Lo, Hi) = split64BitValue(LHS, DAG); - SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32); - SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); + SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); - SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS); - SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS); + SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); + SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); // Re-visit the ands. It's possible we eliminated one of them and it could // simplify the vector. @@ -2408,11 +2508,40 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); } +// We need to specifically handle i64 mul here to avoid unnecessary conversion +// instructions. If we only match on the legalized i64 mul expansion, +// SimplifyDemandedBits will be unable to remove them because there will be +// multiple uses due to the separate mul + mulh[su]. +static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, + SDValue N0, SDValue N1, unsigned Size, bool Signed) { + if (Size <= 32) { + unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); + } + + // Because we want to eliminate extension instructions before the + // operation, we need to create a single user here (i.e. not the separate + // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it. + + unsigned MulOpc = Signed ? 
AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24; + + SDValue Mul = DAG.getNode(MulOpc, SL, + DAG.getVTList(MVT::i32, MVT::i32), N0, N1); + + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, + Mul.getValue(0), Mul.getValue(1)); +} + SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (VT.isVector() || VT.getSizeInBits() > 32) + unsigned Size = VT.getSizeInBits(); + if (VT.isVector() || Size > 64) + return SDValue(); + + // There are i16 integer mul/mad. + if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -2425,11 +2554,11 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + Mul = getMul24(DAG, DL, N0, N1, Size, false); } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + Mul = getMul24(DAG, DL, N0, N1, Size, true); } else { return SDValue(); } @@ -2439,6 +2568,77 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (!Subtarget->hasMulI24() || VT.isVector()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (!isI24(N0, DAG) || !isI24(N1, DAG)) + return SDValue(); + + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + + SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); + DCI.AddToWorklist(Mulhi.getNode()); + return DAG.getSExtOrTrunc(Mulhi, DL, VT); +} + +SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (!isU24(N0, DAG) || !isU24(N1, DAG)) + return SDValue(); + + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + + SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); + DCI.AddToWorklist(Mulhi.getNode()); + return DAG.getZExtOrTrunc(Mulhi, DL, VT); +} + +SDValue AMDGPUTargetLowering::performMulLoHi24Combine( + SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + // Simplify demanded bits before splitting into multiple users. + if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI)) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24); + + unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + unsigned MulHiOpc = Signed ? 
AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; + + SDLoc SL(N); + + SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); + SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); + return DAG.getMergeValues({ MulLo, MulHi }, SL); +} + static bool isNegativeOne(SDValue Val) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) return C->isAllOnesValue(); @@ -2449,23 +2649,21 @@ static bool isCtlzOpc(unsigned Opc) { return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; } -// Get FFBH node if the incoming op may have been type legalized from a smaller -// type VT. -// Need to match pre-legalized type because the generic legalization inserts the -// add/sub between the select and compare. -static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG, - const SDLoc &SL, SDValue Op) { +SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL) const { EVT VT = Op.getValueType(); - EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - if (LegalVT != MVT::i32) + EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && + LegalVT != MVT::i16)) return SDValue(); if (VT != MVT::i32) - Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); - SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); if (VT != MVT::i32) - FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); return FFBH; } @@ -2493,7 +2691,7 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, isCtlzOpc(RHS.getOpcode()) && RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { - return getFFBH_U32(*this, DAG, SL, CmpLHS); + return getFFBH_U32(DAG, CmpLHS, SL); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x @@ -2501,14 +2699,99 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, isCtlzOpc(LHS.getOpcode()) && LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { - return getFFBH_U32(*this, DAG, SL, CmpLHS); + return getFFBH_U32(DAG, CmpLHS, SL); + } + + return SDValue(); +} + +static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, + unsigned Op, + const SDLoc &SL, + SDValue Cond, + SDValue N1, + SDValue N2) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N1.getValueType(); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, + N1.getOperand(0), N2.getOperand(0)); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(Op, SL, VT, NewSelect); +} + +// Pull a free FP operation out of a select so it may fold into uses. 
+// +// select c, (fneg x), (fneg y) -> fneg (select c, x, y) +// select c, (fneg x), k -> fneg (select c, x, (fneg k)) +// +// select c, (fabs x), (fabs y) -> fabs (select c, x, y) +// select c, (fabs x), +k -> fabs (select c, x, k) +static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) { + SelectionDAG &DAG = DCI.DAG; + SDValue Cond = N.getOperand(0); + SDValue LHS = N.getOperand(1); + SDValue RHS = N.getOperand(2); + + EVT VT = N.getValueType(); + if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || + (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + return distributeOpThroughSelect(DCI, LHS.getOpcode(), + SDLoc(N), Cond, LHS, RHS); + } + + bool Inv = false; + if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { + std::swap(LHS, RHS); + Inv = true; + } + + // TODO: Support vector constants. + ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { + SDLoc SL(N); + // If one side is an fneg/fabs and the other is a constant, we can push the + // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. + SDValue NewLHS = LHS.getOperand(0); + SDValue NewRHS = RHS; + + // Careful: if the neg can be folded up, don't try to pull it back down. + bool ShouldFoldNeg = true; + + if (NewLHS.hasOneUse()) { + unsigned Opc = NewLHS.getOpcode(); + if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) + ShouldFoldNeg = false; + if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) + ShouldFoldNeg = false; + } + + if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FNEG) + NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else if (CRHS->isNegative()) + return SDValue(); + + if (Inv) + std::swap(NewLHS, NewRHS); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, + Cond, NewLHS, NewRHS); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); + } } return SDValue(); } + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -2521,6 +2804,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); + if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. + SelectionDAG &DAG = DCI.DAG; + if ((DAG.isConstantValueOfAnyType(True) || + DAG.isConstantValueOfAnyType(True)) && + (!DAG.isConstantValueOfAnyType(False) && + !DAG.isConstantValueOfAnyType(False))) { + // Swap cmp + select pair to move constant to false input. + // This will allow using VOPC cndmasks more often. 
+ // select (setcc x, y), k, x -> select (setcc y, x) x, x + + SDLoc SL(N); + ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + LHS.getValueType().isInteger()); + + SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); + return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); + } + } + if (VT == MVT::f32 && Cond.hasOneUse()) { SDValue MinMax = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); @@ -2533,6 +2835,135 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Opc = N0.getOpcode(); + + // If the input has multiple uses and we can either fold the negate down, or + // the other uses cannot, give up. This both prevents unprofitable + // transformations and infinite loops: we won't repeatedly try to fold around + // a negate that has no 'good' form. + // + // TODO: Check users can fold + if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) + return SDValue(); + + SDLoc SL(N); + switch (Opc) { + case ISD::FADD: { + if (!mayIgnoreSignedZero(N0)) + return SDValue(); + + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMUL: + case AMDGPUISD::FMUL_LEGACY: { + // (fneg (fmul x, y)) -> (fmul x, (fneg y)) + // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (RHS.getOpcode() == ISD::FNEG) + RHS = RHS.getOperand(0); + else + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMA: + case ISD::FMAD: { + if (!mayIgnoreSignedZero(N0)) + return SDValue(); + + // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) + SDValue LHS = N0.getOperand(0); + SDValue MHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (MHS.getOpcode() == ISD::FNEG) + MHS = MHS.getOperand(0); + else + MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FP_EXTEND: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case ISD::FSIN: + case AMDGPUISD::SIN_HW: { + SDValue CvtSrc = N0.getOperand(0); + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_extend (fneg x))) -> (fp_extend x) + // (fneg (rcp (fneg x))) -> (rcp x) + return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_extend x)) 
-> (fp_extend (fneg x)) + // (fneg (rcp x)) -> (rcp (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(Opc, SL, VT, Neg); + } + case ISD::FP_ROUND: { + SDValue CvtSrc = N0.getOperand(0); + + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_round (fneg x))) -> (fp_round x) + return DAG.getNode(ISD::FP_ROUND, SL, VT, + CvtSrc.getOperand(0), N0.getOperand(1)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_round x)) -> (fp_round (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); + } + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2543,6 +2974,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::BITCAST: { EVT DestVT = N->getValueType(0); + + // Push casts through vector builds. This helps avoid emitting a large + // number of copies when materializing floating point vector constants. + // + // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => + // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) + if (DestVT.isVector()) { + SDValue Src = N->getOperand(0); + if (Src.getOpcode() == ISD::BUILD_VECTOR) { + EVT SrcVT = Src.getValueType(); + unsigned NElts = DestVT.getVectorNumElements(); + + if (SrcVT.getVectorNumElements() == NElts) { + EVT DestEltVT = DestVT.getVectorElementType(); + + SmallVector<SDValue, 8> CastedElts; + SDLoc SL(N); + for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { + SDValue Elt = Src.getOperand(I); + CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); + } + + return DAG.getBuildVector(DestVT, SL, CastedElts); + } + } + } + if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) break; @@ -2591,24 +3049,28 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performSraCombine(N, DCI); } - case ISD::AND: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - return performAndCombine(N, DCI); - } case ISD::MUL: return performMulCombine(N, DCI); + case ISD::MULHS: + return performMulhsCombine(N, DCI); + case ISD::MULHU: + return performMulhuCombine(N, DCI); case AMDGPUISD::MUL_I24: - case AMDGPUISD::MUL_U24: { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - simplifyI24(N0, DCI); - simplifyI24(N1, DCI); + case AMDGPUISD::MUL_U24: + case AMDGPUISD::MULHI_I24: + case AMDGPUISD::MULHI_U24: { + // If the first call to simplify is successfull, then N may end up being + // deleted, so we shouldn't call simplifyI24 again. 
+ simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI); return SDValue(); } + case AMDGPUISD::MUL_LOHI_I24: + case AMDGPUISD::MUL_LOHI_U24: + return performMulLoHi24Combine(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); + case ISD::FNEG: + return performFNegCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2705,38 +3167,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, // Helper functions //===----------------------------------------------------------------------===// -void AMDGPUTargetLowering::getOriginalFunctionArgs( - SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl<ISD::InputArg> &Ins, - SmallVectorImpl<ISD::InputArg> &OrigIns) const { - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - if (Ins[i].ArgVT == Ins[i].VT) { - OrigIns.push_back(Ins[i]); - continue; - } - - EVT VT; - if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { - // Vector has been split into scalars. - VT = Ins[i].ArgVT.getVectorElementType(); - } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && - Ins[i].ArgVT.getVectorElementType() != - Ins[i].VT.getVectorElementType()) { - // Vector elements have been promoted - VT = Ins[i].ArgVT; - } else { - // Vector has been spilt into smaller vectors. - VT = Ins[i].VT; - } - - ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, - Ins[i].OrigArgIndex, Ins[i].PartOffset); - OrigIns.push_back(Arg); - } -} - SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2754,7 +3184,8 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { - uint64_t ArgOffset = MFI->ABIArgOffset; + unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); + uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment); switch (Param) { case GRID_DIM: return ArgOffset; @@ -2779,6 +3210,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RETURN) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(FMA_W_CHAIN) + NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(COS_HW) NODE_NAME_CASE(SIN_HW) @@ -2800,7 +3235,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) @@ -2812,12 +3249,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) + NODE_NAME_CASE(FFBH_I32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MULHI_U24) + NODE_NAME_CASE(MULHI_I24) + NODE_NAME_CASE(MUL_LOHI_U24) + NODE_NAME_CASE(MUL_LOHI_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(EXPORT_DONE) + NODE_NAME_CASE(R600_EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) @@ -2833,8 +3277,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + 
NODE_NAME_CASE(KILL) + NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(SENDMSG) + NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) @@ -2844,16 +3291,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) + NODE_NAME_CASE(BUFFER_LOAD) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; } -SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const { - SelectionDAG &DAG = DCI.DAG; +SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &RefinementSteps, + bool &UseOneConstNR, + bool Reciprocal) const { EVT VT = Operand.getValueType(); if (VT == MVT::f32) { @@ -2868,9 +3317,8 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, } SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { - SelectionDAG &DAG = DCI.DAG; + SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const { EVT VT = Operand.getValueType(); if (VT == MVT::f32) { |
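
A note for readers of the LowerFP_TO_FP16 hunk above: the expansion builds the f64 -> f16 round-to-nearest-even conversion entirely out of integer DAG nodes, which makes the bit manipulation hard to follow in SDValue form. The sketch below (plain C++, illustrative names, not code from the patch) mirrors each DAG step on scalars, assuming the DAG constants behave as ordinary 32-bit integer operations.

#include <cstdint>
#include <cstring>

// Scalar mirror of the DAG built by LowerFP_TO_FP16 (illustrative only).
static uint16_t f64ToF16RNE(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));

  uint32_t UH = uint32_t(Bits >> 32); // sign, exponent, top 20 mantissa bits
  uint32_t U  = uint32_t(Bits);       // low 32 mantissa bits

  // Rebias the f64 exponent (bias 1023) to the f16 bias (15).
  int32_t E = int32_t((UH >> 20) & 0x7ff) - 1023 + 15;

  // Top 11 mantissa bits land at [11:1]; bit 0 is a sticky bit collecting the
  // remaining 41 low mantissa bits.
  uint32_t M = (UH >> 8) & 0xffe;
  uint32_t MaskedSig = (UH & 0x1ff) | U;
  if (MaskedSig != 0)
    M |= 1;

  // Result pattern for NaN/Inf inputs (quiet bit set when any mantissa bit is).
  uint32_t I = ((M != 0) ? 0x0200 : 0) | 0x7c00;

  // Normal path: exponent and mantissa, with two rounding bits kept below.
  uint32_t N = M | (uint32_t(E) << 12);

  // Denormal path: shift the significand (implicit leading one included) right
  // and fold anything shifted out into a sticky bit.
  int32_t B = 1 - E;
  B = B < 0 ? 0 : (B > 13 ? 13 : B);
  uint32_t SigSetHigh = M | 0x1000;
  uint32_t D = SigSetHigh >> B;
  if ((D << B) != SigSetHigh)
    D |= 1;

  uint32_t V = (E < 1) ? D : N;

  // Round to nearest, ties to even, using kept-LSB/guard/sticky, then drop the
  // two rounding bits.
  uint32_t VLow3 = V & 7;
  V >>= 2;
  if (VLow3 == 3 || VLow3 > 5)
    V += 1;

  // Exponent overflow becomes infinity; NaN/Inf inputs take the pattern above
  // (1039 is the rebased value of the all-ones f64 exponent 0x7ff).
  if (E > 30)
    V = 0x7c00;
  if (E == 1039)
    V = I;

  uint32_t Sign = (UH >> 16) & 0x8000;
  return uint16_t(Sign | V);
}

As a sanity check, f64ToF16RNE(1.0) yields 0x3c00, and the halfway case 1.0 + 2^-11 also yields 0x3c00 because the tie rounds to the even mantissa.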
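
The mul combine changes (getMul24 and performMulLoHi24Combine above) deliberately emit a single MUL_LOHI_I24/MUL_LOHI_U24 node for a 64-bit multiply of 24-bit operands so that SimplifyDemandedBits sees one user and can strip the extensions. Value-wise the node is nothing exotic; a scalar model is shown below (hypothetical helper name, not from the patch).

#include <cassert>
#include <cstdint>
#include <utility>

// Scalar model of MUL_LOHI_U24: both operands fit in 24 bits, so the full
// product fits in 48 bits and is returned as {low 32 bits, high 32 bits}.
static std::pair<uint32_t, uint32_t> mulLoHiU24(uint32_t A, uint32_t B) {
  assert((A >> 24) == 0 && (B >> 24) == 0 && "operands must be u24");
  uint64_t P = uint64_t(A) * uint64_t(B);
  return {uint32_t(P), uint32_t(P >> 32)}; // {MUL_U24 part, MULHI_U24 part}
}

Once the demanded-bits simplification has run, performMulLoHi24Combine splits the node back into the separate MUL_U24 and MULHI_U24 operations that the hardware instructions match.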
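
foldFreeOpFromSelect relies on fneg and fabs being free source modifiers, so pulling them out of a select is pure sign-bit algebra. A small standalone illustration (plain C++, not code from the patch) of the constant cases, including why the fabs form bails out on a negative constant:

#include <cmath>
#include <cstdio>
#include <initializer_list>

// select c, (fneg x), k -> fneg (select c, x, (fneg k))   (always valid)
// select c, (fabs x), k -> fabs (select c, x, k)          (only for k >= 0)
int main() {
  float X = -1.5f;
  for (float K : {4.0f, -4.0f}) {
    for (bool C : {false, true}) {
      float NegOrig   = C ? -X : K;
      float NegFolded = -(C ? X : -K);        // always equals NegOrig
      float AbsOrig   = C ? std::fabs(X) : K;
      float AbsFolded = std::fabs(C ? X : K); // differs from AbsOrig when K < 0
      std::printf("K=%g C=%d  fneg: %g %g  fabs: %g %g\n",
                  K, C, NegOrig, NegFolded, AbsOrig, AbsFolded);
    }
  }
  return 0;
}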