Diffstat (limited to 'contrib/llvm/lib/Target/R600/SIISelLowering.cpp')
-rw-r--r--  contrib/llvm/lib/Target/R600/SIISelLowering.cpp | 101
1 file changed, 84 insertions, 17 deletions
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
index 6b2ea06..32ae605 100644
--- a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -89,8 +89,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -158,8 +156,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   for (MVT VT : MVT::fp_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 
-  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
-  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
@@ -214,6 +210,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   }
 
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
@@ -314,9 +311,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (!VT.isSimple() || VT == MVT::Other)
     return false;
 
-  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
-  // see what for specifically. The wording everywhere else seems to be the
-  // same.
+  // TODO - CI+ supports unaligned memory accesses, but this requires driver
+  // support.
 
   // XXX - The only mention I see of this in the ISA manual is for LDS direct
   // reads the "byte address and must be dword aligned". Is it also true for the
@@ -328,12 +324,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     return Align % 4 == 0;
   }
 
+  // Smaller than dword value must be aligned.
+  // FIXME: This should be allowed on CI+
+  if (VT.bitsLT(MVT::i32))
+    return false;
+
   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
   // byte-address are ignored, thus forcing Dword alignment.
   // This applies to private, global, and constant memory.
   if (IsFast)
     *IsFast = true;
-  return VT.bitsGT(MVT::i32);
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
 }
 
 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
@@ -448,7 +450,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
       // three or five element vertex only needs three or five registers,
       // NOT four or eight.
-      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
 
       for (unsigned j = 0; j != NumElements; ++j) {
@@ -531,7 +533,7 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    Offset, Ins[i].Flags.isSExt());
 
       const PointerType *ParamTy =
-        dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
+        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
         // On SI local pointers are just offsets into LDS, so they are always
@@ -566,7 +568,7 @@ SDValue SITargetLowering::LowerFormalArguments(
 
     if (Arg.VT.isVector()) {
       // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
 
       SmallVector<SDValue, 4> Regs;
@@ -919,6 +921,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Op.getOperand(1),
                        Op.getOperand(2),
                        Op.getOperand(3));
+
+  case AMDGPUIntrinsic::AMDGPU_fract:
+  case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
+    return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
+                       DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
+
   default:
     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   }
@@ -1104,7 +1112,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
-  return SDValue();
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return LowerFastFDIV(Op, DAG);
+
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+                             NegDivScale0, Mul, DivScale1);
+
+  SDValue Scale;
+
+  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    // Workaround a hardware bug on SI where the condition output from
+    // div_scale is not usable.
+
+    const SDValue Hi = DAG.getConstant(1, MVT::i32);
+
+    // Figure out which scale to use for div_fmas.
+    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
+    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
+    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
+
+    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
+    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
+
+    SDValue Scale0Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
+    SDValue Scale1Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
+
+    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
+    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
+    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
+  } else {
+    Scale = DivScale1.getValue(1);
+  }
+
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
+                             Fma4, Fma3, Mul, Scale);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
 }
 
 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
@@ -1125,11 +1196,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Store->getMemoryVT();
 
   // These stores are legal.
-  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
-      VT.isVector() && VT.getVectorNumElements() == 2 &&
-      VT.getVectorElementType() == MVT::i32)
-    return SDValue();
-
   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
     if (VT.isVector() && VT.getVectorNumElements() > 4)
       return ScalarizeVectorStore(Op, DAG);
@@ -1524,6 +1590,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::UMAX:
   case AMDGPUISD::UMIN: {
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        N->getValueType(0) != MVT::f64 &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMin3Max3Combine(N, DCI);
     break;
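A few notes on the changes above. The rewritten allowsMisalignedMemoryAccesses logic condenses, for the non-LDS path, to a small predicate. A minimal sketch with a hypothetical helper, not the function's real signature (Bits is the access width in bits, Align the byte alignment; the simple-type and LDS checks earlier in the function are omitted):

  // Condensed model of the updated non-LDS path of the predicate.
  bool allowsMisalignedAccess(unsigned Bits, unsigned Align) {
    if (Bits < 32)   // sub-dword values must be aligned (the patch's FIXME
      return false;  // notes this could be relaxed on CI+)
    // Dword-or-larger accesses ignore the two LSBs of the byte address, so
    // wider-than-dword accesses are allowed as long as they are dword-aligned.
    return Bits > 32 && Align % 4 == 0;
  }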
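The new AMDGPU_fract / AMDIL_fraction case expands the intrinsic to FSUB(x, FFLOOR(x)) rather than selecting a dedicated instruction, i.e. the usual definition fract(x) = x - floor(x). A scalar model (hypothetical helper, not part of the patch):

  #include <cmath>

  // Model of the expansion: fract(x) = x - floor(x), always in [0.0, 1.0).
  double fract(double x) {
    return x - std::floor(x);  // fract(2.75) == 0.75, fract(-0.25) == 0.75
  }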
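The bulk of the patch is LowerFDIV64, which builds f64 division from a hardware reciprocal estimate refined by fused multiply-adds. A minimal scalar sketch of the same refinement sequence, assuming std::fma stands in for v_fma_f64 and 1.0 / Y for the hardware reciprocal estimate, and omitting the div_scale exponent adjustment and the div_fixup special-case handling (NaNs, infinities, denormals); fdiv64_model is a hypothetical name, with locals named after the SDValues above:

  #include <cmath>

  // Scalar model of the FMA refinement sequence in LowerFDIV64.
  double fdiv64_model(double X, double Y) {
    double Rcp  = 1.0 / Y;                    // RCP: initial estimate
    double Fma0 = std::fma(-Y, Rcp, 1.0);     // error of the estimate
    double Fma1 = std::fma(Rcp, Fma0, Rcp);   // first Newton-Raphson step
    double Fma2 = std::fma(-Y, Fma1, 1.0);    // remaining error
    double Fma3 = std::fma(Fma1, Fma2, Fma1); // second refinement step
    double Mul  = X * Fma3;                   // initial quotient
    double Fma4 = std::fma(-Y, Mul, X);       // remainder of the quotient
    return std::fma(Fma4, Fma3, Mul);         // DIV_FMAS: final correction
  }

This is the classic Newton-Raphson reciprocal refinement: two iterations sharpen the hardware estimate, then one remainder step corrects the quotient before DIV_FIXUP patches up the special cases.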
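The SOUTHERN_ISLANDS branch works around the unusable div_scale condition output by recomputing which operand was rescaled: each f64 is bitcast to v2i32, the high dword (element 1, which holds the sign and exponent bits) is compared against the corresponding div_scale result, and the two comparisons are XORed. A scalar model of that dance, with hypothetical helpers not in the patch:

  #include <cstdint>
  #include <cstring>

  // High 32 bits of an f64, as the BITCAST + EXTRACT_VECTOR_ELT pair
  // extracts them above.
  static uint32_t highDword(double V) {
    uint64_t Bits;
    std::memcpy(&Bits, &V, sizeof Bits);
    return uint32_t(Bits >> 32);
  }

  // Scale = (NumHi == Scale1Hi) XOR (DenHi == Scale0Hi): true when exactly
  // one of the two div_scale results was actually rescaled.
  bool chooseScale(double X, double Y, double DivScale0, double DivScale1) {
    bool CmpDen = highDword(Y) == highDword(DivScale0);
    bool CmpNum = highDword(X) == highDword(DivScale1);
    return CmpNum != CmpDen;  // != on bool plays the role of the i1 XOR
  }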