summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp1078
1 files changed, 763 insertions, 315 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 352423ed..54caa2c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -37,7 +37,7 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
MachineFunction &MF = State.getMachineFunction();
AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
- uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+ uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
ArgFlags.getOrigAlign());
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
@@ -55,14 +55,6 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
- unsigned StoreSize = VT.getStoreSizeInBits();
- if (StoreSize <= 32)
- return EVT::getIntegerVT(Ctx, StoreSize);
-
- return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
-}
-
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -180,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
-
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
-
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -287,6 +269,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
@@ -367,6 +350,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::OR, VT, Expand);
setOperationAction(ISD::SHL, VT, Expand);
setOperationAction(ISD::SRA, VT, Expand);
@@ -440,22 +425,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
+ // There are no libcalls of any kind.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
+ // FIXME: This is only partially true. If we have to do vector compares, any
+ // SGPR pair can be a condition register. If we have a uniform condition, we
+ // are better off doing SALU operations, where there is only one SCC. For now,
+ // we don't have a way of knowing during instruction selection if a condition
+ // will be uniform and we always use vector compares. Assume we are using
+ // vector compares until that is fixed.
+ setHasMultipleConditionRegisters(true);
+
// SI at least has hardware support for floating point exceptions, but no way
// of using or handling them is implemented. They are also optional in OpenCL
// (Section 7.3)
setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
- setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
@@ -472,22 +466,42 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemset = 4096;
setTargetDAGCombine(ISD::BITCAST);
- setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MULHU);
+ setTargetDAGCombine(ISD::MULHS);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+static bool fnegFoldsIntoOp(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FSIN:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMUL_LEGACY:
+ return true;
+ default:
+ return false;
+ }
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -500,7 +514,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
EVT ScalarVT = VT.getScalarType();
- return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
+ return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
+ (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}
// We don't want to shrink f64 / f32 constants.
@@ -565,12 +580,12 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64;
+ return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
+ VT == MVT::f16);
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
- assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64;
+ return isFAbsFree(VT);
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -593,19 +608,32 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
// Truncate is just accessing a subregister.
- return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+
+ unsigned SrcSize = Source.getSizeInBits();
+ unsigned DestSize = Dest.getSizeInBits();
+
+ return DestSize < SrcSize && DestSize % 32 == 0 ;
}
bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
// Truncate is just accessing a subregister.
- return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
- (Dest->getPrimitiveSizeInBits() % 32 == 0);
+
+ unsigned SrcSize = Source->getScalarSizeInBits();
+ unsigned DestSize = Dest->getScalarSizeInBits();
+
+ if (DestSize== 16 && Subtarget->has16BitInsts())
+ return SrcSize >= 32;
+
+ return DestSize < SrcSize && DestSize % 32 == 0;
}
bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
unsigned SrcSize = Src->getScalarSizeInBits();
unsigned DestSize = Dest->getScalarSizeInBits();
+ if (SrcSize == 16 && Subtarget->has16BitInsts())
+ return DestSize >= 32;
+
return SrcSize == 32 && DestSize == 64;
}
@@ -614,6 +642,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
// practical purposes, the extra mov 0 to load a 64-bit is free. As used,
// this will enable reducing 64-bit operations the 32-bit, which is always
// good.
+
+ if (Src == MVT::i16)
+ return Dest == MVT::i32 ||Dest == MVT::i64 ;
+
return Src == MVT::i32 && Dest == MVT::i64;
}
@@ -635,9 +667,105 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+/// The SelectionDAGBuilder will automatically promote function arguments
+/// with illegal types. However, this does not work for the AMDGPU targets
+/// since the function arguments are stored in memory as these illegal types.
+/// In order to handle this properly we need to get the original types sizes
+/// from the LLVM IR Function and fixup the ISD:InputArg values before
+/// passing them to AnalyzeFormalArguments()
+
+/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
+/// input values across multiple registers. Each item in the Ins array
+/// represents a single value that will be stored in regsters. Ins[x].VT is
+/// the value type of the value that will be stored in the register, so
+/// whatever SDNode we lower the argument to needs to be this type.
+///
+/// In order to correctly lower the arguments we need to know the size of each
+/// argument. Since Ins[x].VT gives us the size of the register that will
+/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
+/// for the orignal function argument so that we can deduce the correct memory
+/// type to use for Ins[x]. In most cases the correct memory type will be
+/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
+/// we have a kernel argument of type v8i8, this argument will be split into
+/// 8 parts and each part will be represented by its own item in the Ins array.
+/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
+/// the argument before it was split. From this, we deduce that the memory type
+/// for each individual part is i8. We pass the memory type as LocVT to the
+/// calling convention analysis function and the register type (Ins[x].VT) as
+/// the ValVT.
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ const ISD::InputArg &In = Ins[i];
+ EVT MemVT;
+
+ unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ assert(!In.Flags.isSplit());
+ if (In.ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the register type
+ MemVT = In.VT;
+ } else {
+ MemVT = In.ArgVT;
+ }
+ } else if (In.ArgVT.isVector() && In.VT.isVector() &&
+ In.ArgVT.getScalarType() == In.VT.getScalarType()) {
+ assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = In.VT;
+ } else if (In.ArgVT.isVector() &&
+ In.ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = In.ArgVT.getScalarType();
+ } else if (In.ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = In.VT;
+ } else {
+ unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
+ assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (In.VT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (In.VT.isVector()) {
+ assert(!In.VT.getScalarType().isFloatingPoint());
+ unsigned NumElements = In.VT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+ // a different elements size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
+ }
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
+
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
+
+ assert(MemVT.isSimple());
+ allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
+ State);
+ }
+}
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}
@@ -678,8 +806,10 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
DAG.getContext()->diagnose(NoCalls);
- for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
- InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+ if (!CLI.IsTailCall) {
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+ }
return DAG.getEntryNode();
}
@@ -718,6 +848,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::CTLZ:
@@ -745,94 +876,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-// FIXME: This implements accesses to initialized globals in the constant
-// address space by copying them to private and accessing that. It does not
-// properly handle illegal types or vectors. The private vector loads are not
-// scalarized, and the illegal scalars hit an assertion. This technique will not
-// work well with large initializers, and this should eventually be
-// removed. Initialized globals should be placed into a data section that the
-// runtime will load into a buffer before the kernel is executed. Uses of the
-// global need to be replaced with a pointer loaded from an implicit kernel
-// argument into this buffer holding the copy of the data, which will remove the
-// need for any of this.
-SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
- const GlobalValue *GV,
- const SDValue &InitPtr,
- SDValue Chain,
- SelectionDAG &DAG) const {
- const DataLayout &TD = DAG.getDataLayout();
- SDLoc DL(InitPtr);
- Type *InitTy = Init->getType();
-
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
- EVT VT = EVT::getEVT(InitTy);
- PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
- return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- TD.getPrefTypeAlignment(InitTy));
- }
-
- if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
- EVT VT = EVT::getEVT(CFP->getType());
- PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
- return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- TD.getPrefTypeAlignment(CFP->getType()));
- }
-
- if (StructType *ST = dyn_cast<StructType>(InitTy)) {
- const StructLayout *SL = TD.getStructLayout(ST);
-
- EVT PtrVT = InitPtr.getValueType();
- SmallVector<SDValue, 8> Chains;
-
- for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
- SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-
- Constant *Elt = Init->getAggregateElement(I);
- Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
- }
-
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
- }
-
- if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
- EVT PtrVT = InitPtr.getValueType();
-
- unsigned NumElements;
- if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
- NumElements = AT->getNumElements();
- else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
- NumElements = VT->getNumElements();
- else
- llvm_unreachable("Unexpected type");
-
- unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
- SmallVector<SDValue, 8> Chains;
- for (unsigned i = 0; i < NumElements; ++i) {
- SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-
- Constant *Elt = Init->getAggregateElement(i);
- Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
- }
-
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
- }
-
- if (isa<UndefValue>(Init)) {
- EVT VT = EVT::getEVT(InitTy);
- PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
- return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- TD.getPrefTypeAlignment(InitTy));
- }
-
- Init->dump();
- llvm_unreachable("Unhandled constant initializer");
-}
-
static bool hasDefinedInitializer(const GlobalValue *GV) {
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar || !GVar->hasInitializer())
@@ -850,11 +893,6 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
- case AMDGPUAS::CONSTANT_ADDRESS: {
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
- SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
- return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
- }
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
@@ -864,24 +902,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (hasDefinedInitializer(GV))
break;
- unsigned Offset;
- if (MFI->LocalMemoryObjects.count(GV) == 0) {
- unsigned Align = GV->getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV->getValueType());
-
- /// TODO: We should sort these to minimize wasted space due to alignment
- /// padding. Currently the padding is decided by the first encountered use
- /// during lowering.
- Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
- MFI->LocalMemoryObjects[GV] = Offset;
- MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
- } else {
- Offset = MFI->LocalMemoryObjects[GV];
- }
-
- return DAG.getConstant(Offset, SDLoc(Op),
- getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}
}
@@ -1097,65 +1119,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return DAG.getMergeValues(Ops, SL);
}
-// FIXME: This isn't doing anything for SI. This should be used in a target
-// combine during type legalization.
-SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
- SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT MemVT = Store->getMemoryVT();
- unsigned MemBits = MemVT.getSizeInBits();
-
- // Byte stores are really expensive, so if possible, try to pack 32-bit vector
- // truncating store into an i32 store.
- // XXX: We could also handle optimize other vector bitwidths.
- if (!MemVT.isVector() || MemBits > 32) {
- return SDValue();
- }
-
- SDLoc DL(Op);
- SDValue Value = Store->getValue();
- EVT VT = Value.getValueType();
- EVT ElemVT = VT.getVectorElementType();
- SDValue Ptr = Store->getBasePtr();
- EVT MemEltVT = MemVT.getVectorElementType();
- unsigned MemEltBits = MemEltVT.getSizeInBits();
- unsigned MemNumElements = MemVT.getVectorNumElements();
- unsigned PackedSize = MemVT.getStoreSizeInBits();
- SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32);
-
- assert(Value.getValueType().getScalarSizeInBits() >= 32);
-
- SDValue PackedValue;
- for (unsigned i = 0; i < MemNumElements; ++i) {
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
- DAG.getConstant(i, DL, MVT::i32));
- Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
- Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
-
- SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32);
- Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
-
- if (i == 0) {
- PackedValue = Elt;
- } else {
- PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
- }
- }
-
- if (PackedSize < 32) {
- EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
- return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(), PackedVT,
- Store->getAlignment(),
- Store->getMemOperand()->getFlags());
- }
-
- return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(),
- Store->getAlignment(),
- Store->getMemOperand()->getFlags());
-}
-
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1670,7 +1633,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f64);
- APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+ APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
@@ -1681,7 +1644,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
- APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+ APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
EVT SetCCVT =
@@ -1988,14 +1951,26 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"operation should be legal");
+ // TODO: Factor out code common with LowerSINT_TO_FP.
+
EVT DestVT = Op.getValueType();
- if (DestVT == MVT::f64)
- return LowerINT_TO_FP64(Op, DAG, false);
+ if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+ SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+ SDValue FPRound =
+ DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+
+ return FPRound;
+ }
if (DestVT == MVT::f32)
return LowerINT_TO_FP32(Op, DAG, false);
- return SDValue();
+ assert(DestVT == MVT::f64);
+ return LowerINT_TO_FP64(Op, DAG, false);
}
SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -2003,14 +1978,26 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"operation should be legal");
+ // TODO: Factor out code common with LowerUINT_TO_FP.
+
EVT DestVT = Op.getValueType();
+ if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+ SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+ SDValue FPRound =
+ DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+
+ return FPRound;
+ }
+
if (DestVT == MVT::f32)
return LowerINT_TO_FP32(Op, DAG, true);
- if (DestVT == MVT::f64)
- return LowerINT_TO_FP64(Op, DAG, true);
-
- return SDValue();
+ assert(DestVT == MVT::f64);
+ return LowerINT_TO_FP64(Op, DAG, true);
}
SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
@@ -2042,10 +2029,118 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
+SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+
+ if (getTargetMachine().Options.UnsafeFPMath) {
+ // There is a generic expand for FP_TO_FP16 with unsafe fast math.
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+ assert (N0.getSimpleValueType() == MVT::f64);
+
+ // f64 -> f16 conversion using round-to-nearest-even rounding mode.
+ const unsigned ExpMask = 0x7ff;
+ const unsigned ExpBiasf64 = 1023;
+ const unsigned ExpBiasf16 = 15;
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ SDValue One = DAG.getConstant(1, DL, MVT::i32);
+ SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
+ SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
+ DAG.getConstant(32, DL, MVT::i64));
+ UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
+ U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
+ SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(20, DL, MVT::i64));
+ E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
+ DAG.getConstant(ExpMask, DL, MVT::i32));
+ // Subtract the fp64 exponent bias (1023) to get the real exponent and
+ // add the f16 bias (15) to get the biased exponent for the f16 format.
+ E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
+ DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
+
+ SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(8, DL, MVT::i32));
+ M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
+ DAG.getConstant(0xffe, DL, MVT::i32));
+
+ SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
+ DAG.getConstant(0x1ff, DL, MVT::i32));
+ MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
+
+ SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
+ M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
+
+ // (M != 0 ? 0x0200 : 0) | 0x7c00;
+ SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
+ Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
+
+ // N = M | (E << 12);
+ SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+ DAG.getNode(ISD::SHL, DL, MVT::i32, E,
+ DAG.getConstant(12, DL, MVT::i32)));
+
+ // B = clamp(1-E, 0, 13);
+ SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ One, E);
+ SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
+ B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
+ DAG.getConstant(13, DL, MVT::i32));
+
+ SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+ DAG.getConstant(0x1000, DL, MVT::i32));
+
+ SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
+ SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
+ SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
+ D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
+
+ SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
+ SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
+ DAG.getConstant(0x7, DL, MVT::i32));
+ V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
+ One, Zero, ISD::SETEQ);
+ SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
+ One, Zero, ISD::SETGT);
+ V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
+ V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
+
+ V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
+ DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
+ V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
+ I, V, ISD::SETEQ);
+
+ // Extract the sign bit.
+ SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(16, DL, MVT::i32));
+ Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
+ DAG.getConstant(0x8000, DL, MVT::i32));
+
+ V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
+ return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
+}
+
SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ // TODO: Factor out code common with LowerFP_TO_UINT.
+
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+ SDLoc DL(Op);
+
+ SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+ SDValue FpToInt32 =
+ DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
+
+ return FpToInt32;
+ }
+
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
return LowerFP64_TO_INT(Op, DAG, true);
@@ -2056,6 +2151,19 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ // TODO: Factor out code common with LowerFP_TO_SINT.
+
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+ SDLoc DL(Op);
+
+ SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+ SDValue FpToInt32 =
+ DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
+
+ return FpToInt32;
+ }
+
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
return LowerFP64_TO_INT(Op, DAG, false);
@@ -2068,8 +2176,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
- if (!VT.isVector())
- return SDValue();
+ assert(VT.isVector());
SDValue Src = Op.getOperand(0);
SDLoc DL(Op);
@@ -2108,17 +2215,20 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
(VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
}
-static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
+static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
+ TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op = Node24->getOperand(OpIdx);
EVT VT = Op.getValueType();
APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
- if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
- DCI.CommitTargetLoweringOpt(TLO);
+ if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
+ return true;
+
+ return false;
}
template <typename IntTy>
@@ -2188,6 +2298,9 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (VT.isVector())
+ return scalarizeVectorLoad(LN, DAG);
+
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
return DAG.getMergeValues(Ops, SDLoc(N));
@@ -2236,8 +2349,12 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (VT.isVector())
+ return scalarizeVectorStore(SN, DAG);
+
return expandUnalignedStore(SN, DAG);
+ }
if (!IsFast)
return SDValue();
@@ -2262,38 +2379,21 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
-// TODO: Should repeat for other bit ops.
-SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
- return SDValue();
-
- // Break up 64-bit and of a constant into two 32-bit ands. This will typically
- // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer
- // combine opportunities since most 64-bit operations are decomposed this way.
- // TODO: We won't want this for SALU especially if it is an inline immediate.
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS)
- return SDValue();
-
- uint64_t Val = RHS->getZExtValue();
- if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
- // If either half of the constant is 0, this is really a 32-bit and, so
- // split it. If we can re-use the full materialized constant, keep it.
- return SDValue();
- }
-
- SDLoc SL(N);
+/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
+/// binary operation \p Opc to it with the corresponding constant operands.
+SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
+ DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const {
SelectionDAG &DAG = DCI.DAG;
-
SDValue Lo, Hi;
- std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
+ std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
- SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
- SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
+ SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
- SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
- SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+ SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
+ SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
// Re-visit the ands. It's possible we eliminated one of them and it could
// simplify the vector.
@@ -2408,11 +2508,40 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+// We need to specifically handle i64 mul here to avoid unnecessary conversion
+// instructions. If we only match on the legalized i64 mul expansion,
+// SimplifyDemandedBits will be unable to remove them because there will be
+// multiple uses due to the separate mul + mulh[su].
+static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue N0, SDValue N1, unsigned Size, bool Signed) {
+ if (Size <= 32) {
+ unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
+ }
+
+ // Because we want to eliminate extension instructions before the
+ // operation, we need to create a single user here (i.e. not the separate
+ // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
+
+ unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
+
+ SDValue Mul = DAG.getNode(MulOpc, SL,
+ DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
+
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
+ Mul.getValue(0), Mul.getValue(1));
+}
+
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
- if (VT.isVector() || VT.getSizeInBits() > 32)
+ unsigned Size = VT.getSizeInBits();
+ if (VT.isVector() || Size > 64)
+ return SDValue();
+
+ // There are i16 integer mul/mad.
+ if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -2425,11 +2554,11 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+ Mul = getMul24(DAG, DL, N0, N1, Size, false);
} else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+ Mul = getMul24(DAG, DL, N0, N1, Size, true);
} else {
return SDValue();
}
@@ -2439,6 +2568,77 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
return DAG.getSExtOrTrunc(Mul, DL, VT);
}
+SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+
+ if (!Subtarget->hasMulI24() || VT.isVector())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (!isI24(N0, DAG) || !isI24(N1, DAG))
+ return SDValue();
+
+ N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+
+ SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
+ DCI.AddToWorklist(Mulhi.getNode());
+ return DAG.getSExtOrTrunc(Mulhi, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+
+ if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (!isU24(N0, DAG) || !isU24(N1, DAG))
+ return SDValue();
+
+ N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+
+ SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
+ DCI.AddToWorklist(Mulhi.getNode());
+ return DAG.getZExtOrTrunc(Mulhi, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Simplify demanded bits before splitting into multiple users.
+ if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
+
+ unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
+
+ SDLoc SL(N);
+
+ SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
+ SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
+ return DAG.getMergeValues({ MulLo, MulHi }, SL);
+}
+
static bool isNegativeOne(SDValue Val) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
return C->isAllOnesValue();
@@ -2449,23 +2649,21 @@ static bool isCtlzOpc(unsigned Opc) {
return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
}
-// Get FFBH node if the incoming op may have been type legalized from a smaller
-// type VT.
-// Need to match pre-legalized type because the generic legalization inserts the
-// add/sub between the select and compare.
-static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
- const SDLoc &SL, SDValue Op) {
+SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
+ SDValue Op,
+ const SDLoc &DL) const {
EVT VT = Op.getValueType();
- EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- if (LegalVT != MVT::i32)
+ EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
+ LegalVT != MVT::i16))
return SDValue();
if (VT != MVT::i32)
- Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op);
+ Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
- SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op);
+ SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
if (VT != MVT::i32)
- FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH);
+ FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
return FFBH;
}
@@ -2493,7 +2691,7 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
isCtlzOpc(RHS.getOpcode()) &&
RHS.getOperand(0) == CmpLHS &&
isNegativeOne(LHS)) {
- return getFFBH_U32(*this, DAG, SL, CmpLHS);
+ return getFFBH_U32(DAG, CmpLHS, SL);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
@@ -2501,14 +2699,99 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
isCtlzOpc(LHS.getOpcode()) &&
LHS.getOperand(0) == CmpLHS &&
isNegativeOne(RHS)) {
- return getFFBH_U32(*this, DAG, SL, CmpLHS);
+ return getFFBH_U32(DAG, CmpLHS, SL);
+ }
+
+ return SDValue();
+}
+
+static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
+ unsigned Op,
+ const SDLoc &SL,
+ SDValue Cond,
+ SDValue N1,
+ SDValue N2) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N1.getValueType();
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
+ N1.getOperand(0), N2.getOperand(0));
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(Op, SL, VT, NewSelect);
+}
+
+// Pull a free FP operation out of a select so it may fold into uses.
+//
+// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
+// select c, (fneg x), k -> fneg (select c, x, (fneg k))
+//
+// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
+// select c, (fabs x), +k -> fabs (select c, x, k)
+static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Cond = N.getOperand(0);
+ SDValue LHS = N.getOperand(1);
+ SDValue RHS = N.getOperand(2);
+
+ EVT VT = N.getValueType();
+ if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
+ (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ return distributeOpThroughSelect(DCI, LHS.getOpcode(),
+ SDLoc(N), Cond, LHS, RHS);
+ }
+
+ bool Inv = false;
+ if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
+ std::swap(LHS, RHS);
+ Inv = true;
+ }
+
+ // TODO: Support vector constants.
+ ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
+ SDLoc SL(N);
+ // If one side is an fneg/fabs and the other is a constant, we can push the
+ // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
+ SDValue NewLHS = LHS.getOperand(0);
+ SDValue NewRHS = RHS;
+
+ // Careful: if the neg can be folded up, don't try to pull it back down.
+ bool ShouldFoldNeg = true;
+
+ if (NewLHS.hasOneUse()) {
+ unsigned Opc = NewLHS.getOpcode();
+ if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+ ShouldFoldNeg = false;
+ if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
+ ShouldFoldNeg = false;
+ }
+
+ if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FNEG)
+ NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else if (CRHS->isNegative())
+ return SDValue();
+
+ if (Inv)
+ std::swap(NewLHS, NewRHS);
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+ Cond, NewLHS, NewRHS);
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+ }
}
return SDValue();
}
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -2521,6 +2804,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
+ if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
+ SelectionDAG &DAG = DCI.DAG;
+ if ((DAG.isConstantValueOfAnyType(True) ||
+ DAG.isConstantValueOfAnyType(True)) &&
+ (!DAG.isConstantValueOfAnyType(False) &&
+ !DAG.isConstantValueOfAnyType(False))) {
+ // Swap cmp + select pair to move constant to false input.
+ // This will allow using VOPC cndmasks more often.
+ // select (setcc x, y), k, x -> select (setcc y, x) x, x
+
+ SDLoc SL(N);
+ ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ LHS.getValueType().isInteger());
+
+ SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
+ return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
+ }
+ }
+
if (VT == MVT::f32 && Cond.hasOneUse()) {
SDValue MinMax
= CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
@@ -2533,6 +2835,135 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}
+SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc = N0.getOpcode();
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form.
+ //
+ // TODO: Check users can fold
+ if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
+ return SDValue();
+
+ SDLoc SL(N);
+ switch (Opc) {
+ case ISD::FADD: {
+ if (!mayIgnoreSignedZero(N0))
+ return SDValue();
+
+ // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() != ISD::FNEG)
+ LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMUL:
+ case AMDGPUISD::FMUL_LEGACY: {
+ // (fneg (fmul x, y)) -> (fmul x, (fneg y))
+ // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (RHS.getOpcode() == ISD::FNEG)
+ RHS = RHS.getOperand(0);
+ else
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMA:
+ case ISD::FMAD: {
+ if (!mayIgnoreSignedZero(N0))
+ return SDValue();
+
+ // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
+ SDValue LHS = N0.getOperand(0);
+ SDValue MHS = N0.getOperand(1);
+ SDValue RHS = N0.getOperand(2);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (MHS.getOpcode() == ISD::FNEG)
+ MHS = MHS.getOperand(0);
+ else
+ MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FP_EXTEND:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case ISD::FSIN:
+ case AMDGPUISD::SIN_HW: {
+ SDValue CvtSrc = N0.getOperand(0);
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_extend (fneg x))) -> (fp_extend x)
+ // (fneg (rcp (fneg x))) -> (rcp x)
+ return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_extend x)) -> (fp_extend (fneg x))
+ // (fneg (rcp x)) -> (rcp (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(Opc, SL, VT, Neg);
+ }
+ case ISD::FP_ROUND: {
+ SDValue CvtSrc = N0.getOperand(0);
+
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_round (fneg x))) -> (fp_round x)
+ return DAG.getNode(ISD::FP_ROUND, SL, VT,
+ CvtSrc.getOperand(0), N0.getOperand(1));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_round x)) -> (fp_round (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2543,6 +2974,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::BITCAST: {
EVT DestVT = N->getValueType(0);
+
+ // Push casts through vector builds. This helps avoid emitting a large
+ // number of copies when materializing floating point vector constants.
+ //
+ // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+ // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+ if (DestVT.isVector()) {
+ SDValue Src = N->getOperand(0);
+ if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT SrcVT = Src.getValueType();
+ unsigned NElts = DestVT.getVectorNumElements();
+
+ if (SrcVT.getVectorNumElements() == NElts) {
+ EVT DestEltVT = DestVT.getVectorElementType();
+
+ SmallVector<SDValue, 8> CastedElts;
+ SDLoc SL(N);
+ for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
+ SDValue Elt = Src.getOperand(I);
+ CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
+ }
+
+ return DAG.getBuildVector(DestVT, SL, CastedElts);
+ }
+ }
+ }
+
if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
break;
@@ -2591,24 +3049,28 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSraCombine(N, DCI);
}
- case ISD::AND: {
- if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
- break;
-
- return performAndCombine(N, DCI);
- }
case ISD::MUL:
return performMulCombine(N, DCI);
+ case ISD::MULHS:
+ return performMulhsCombine(N, DCI);
+ case ISD::MULHU:
+ return performMulhuCombine(N, DCI);
case AMDGPUISD::MUL_I24:
- case AMDGPUISD::MUL_U24: {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- simplifyI24(N0, DCI);
- simplifyI24(N1, DCI);
+ case AMDGPUISD::MUL_U24:
+ case AMDGPUISD::MULHI_I24:
+ case AMDGPUISD::MULHI_U24: {
+ // If the first call to simplify is successfull, then N may end up being
+ // deleted, so we shouldn't call simplifyI24 again.
+ simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
return SDValue();
}
+ case AMDGPUISD::MUL_LOHI_I24:
+ case AMDGPUISD::MUL_LOHI_U24:
+ return performMulLoHi24Combine(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
+ case ISD::FNEG:
+ return performFNegCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
@@ -2705,38 +3167,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// Helper functions
//===----------------------------------------------------------------------===//
-void AMDGPUTargetLowering::getOriginalFunctionArgs(
- SelectionDAG &DAG,
- const Function *F,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<ISD::InputArg> &OrigIns) const {
-
- for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
- if (Ins[i].ArgVT == Ins[i].VT) {
- OrigIns.push_back(Ins[i]);
- continue;
- }
-
- EVT VT;
- if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
- // Vector has been split into scalars.
- VT = Ins[i].ArgVT.getVectorElementType();
- } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
- Ins[i].ArgVT.getVectorElementType() !=
- Ins[i].VT.getVectorElementType()) {
- // Vector elements have been promoted
- VT = Ins[i].ArgVT;
- } else {
- // Vector has been spilt into smaller vectors.
- VT = Ins[i].VT;
- }
-
- ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
- Ins[i].OrigArgIndex, Ins[i].PartOffset);
- OrigIns.push_back(Arg);
- }
-}
-
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2754,7 +3184,8 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
- uint64_t ArgOffset = MFI->ABIArgOffset;
+ unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
switch (Param) {
case GRID_DIM:
return ArgOffset;
@@ -2779,6 +3210,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(FMA_W_CHAIN)
+ NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
NODE_NAME_CASE(COS_HW)
NODE_NAME_CASE(SIN_HW)
@@ -2800,7 +3235,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
+ NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
NODE_NAME_CASE(FP_CLASS)
@@ -2812,12 +3249,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
NODE_NAME_CASE(FFBH_U32)
+ NODE_NAME_CASE(FFBH_I32)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
+ NODE_NAME_CASE(MULHI_U24)
+ NODE_NAME_CASE(MULHI_I24)
+ NODE_NAME_CASE(MUL_LOHI_U24)
+ NODE_NAME_CASE(MUL_LOHI_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
+ NODE_NAME_CASE(EXPORT_DONE)
+ NODE_NAME_CASE(R600_EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
@@ -2833,8 +3277,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(KILL)
+ NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
+ NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
@@ -2844,16 +3291,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
+ NODE_NAME_CASE(BUFFER_LOAD)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
}
-SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps,
- bool &UseOneConstNR) const {
- SelectionDAG &DAG = DCI.DAG;
+SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
+ SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps,
+ bool &UseOneConstNR,
+ bool Reciprocal) const {
EVT VT = Operand.getValueType();
if (VT == MVT::f32) {
@@ -2868,9 +3317,8 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
}
SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const {
- SelectionDAG &DAG = DCI.DAG;
+ SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const {
EVT VT = Operand.getValueType();
if (VT == MVT::f32) {
OpenPOWER on IntegriCloud