Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 1610 |
1 files changed, 1107 insertions, 503 deletions
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 16ddaf1..5c52bb1 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -13,6 +13,7 @@
 #include "PPCISelLowering.h"
 #include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
 #include "PPCMachineFunctionInfo.h"
 #include "PPCPerfectShuffle.h"
 #include "PPCTargetMachine.h"
@@ -24,6 +25,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -39,6 +41,10 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;

+// FIXME: Remove this once soft-float is supported.
+static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic",
+cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden);
+
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

@@ -51,20 +57,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;

-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  // If it isn't a Mach-O file then it's going to be a linux ELF
-  // object file.
-  if (TT.isOSDarwin())
-    return new TargetLoweringObjectFileMachO();
-
-  return new PPC64LinuxTargetObjectFile();
-}
-
-PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
+    : TargetLowering(TM),
       Subtarget(*TM.getSubtargetImpl()) {
-  setPow2DivIsCheap();
-
   // Use _setjmp/_longjmp instead of setjmp/longjmp.
   setUseUnderscoreSetJmp(true);
   setUseUnderscoreLongJmp(true);
@@ -80,8 +75,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   addRegisterClass(MVT::f64, &PPC::F8RCRegClass);

   // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+  }

   setTruncStoreAction(MVT::f64, MVT::f32, Expand);

@@ -120,12 +117,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   if (ANDIGlueBug)
     setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
-  setTruncStoreAction(MVT::i32, MVT::i1, Expand);
-  setTruncStoreAction(MVT::i16, MVT::i1, Expand);
-  setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setTruncStoreAction(VT, MVT::i1, Expand);
+  }

   addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
 }
@@ -400,10 +396,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   if (Subtarget.hasAltivec()) {
     // First set operation action for all vector types to expand. Then we
     // will selectively turn on ones that can be effectively codegen'd.
-    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
-      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
-
+    for (MVT VT : MVT::vector_valuetypes()) {
       // add/sub are legal for all supported vector VT's.
       setOperationAction(ISD::ADD , VT, Legal);
       setOperationAction(ISD::SUB , VT, Legal);
@@ -470,14 +463,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
       setOperationAction(ISD::VSELECT, VT, Expand);
       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

-      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
-        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
         setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
       }
-      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
     }

     // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
@@ -604,15 +595,15 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     }
   }

-  if (Subtarget.has64BitSupport()) {
+  if (Subtarget.has64BitSupport())
     setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
-    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
-  }

-  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
+
+  if (!isPPC64) {
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  }

   setBooleanContents(ZeroOrOneBooleanContent);
   // Altivec instructions set fields to all zeros or all ones.
@@ -637,6 +628,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::SINT_TO_FP);
+  if (Subtarget.hasFPCVT())
+    setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::BR_CC);
@@ -644,6 +637,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setTargetDAGCombine(ISD::BRCOND);
   setTargetDAGCombine(ISD::BSWAP);
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);

   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -684,10 +679,23 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   if (Subtarget.isDarwin())
     setPrefFunctionAlignment(4);

-  if (isPPC64 && Subtarget.isJITCodeModel())
-    // Temporary workaround for the inability of PPC64 JIT to handle jump
-    // tables.
-    setSupportJumpTables(false);
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+    setPrefFunctionAlignment(4);
+    setPrefLoopAlignment(4);
+    break;
+  }

   setInsertFencesForAtomic(true);

@@ -698,8 +706,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   computeRegisterProperties();

-  // The Freescale cores does better with aggressive inlining of memcpy and
-  // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
   if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
       Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
     MaxStoresPerMemset = 32;
@@ -708,8 +716,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     MaxStoresPerMemcpyOptSize = 8;
     MaxStoresPerMemmove = 32;
     MaxStoresPerMemmoveOptSize = 8;
-
-    setPrefFunctionAlignment(4);
   }
 }

@@ -761,14 +767,20 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   default: return nullptr;
   case PPCISD::FSEL:            return "PPCISD::FSEL";
   case PPCISD::FCFID:           return "PPCISD::FCFID";
+  case PPCISD::FCFIDU:          return "PPCISD::FCFIDU";
+  case PPCISD::FCFIDS:          return "PPCISD::FCFIDS";
+  case PPCISD::FCFIDUS:         return "PPCISD::FCFIDUS";
   case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
   case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FCTIDUZ:         return "PPCISD::FCTIDUZ";
+  case PPCISD::FCTIWUZ:         return "PPCISD::FCTIWUZ";
   case PPCISD::FRE:             return "PPCISD::FRE";
   case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
   case PPCISD::STFIWX:          return "PPCISD::STFIWX";
   case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
   case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
   case PPCISD::VPERM:           return "PPCISD::VPERM";
+  case PPCISD::CMPB:            return "PPCISD::CMPB";
   case PPCISD::Hi:              return "PPCISD::Hi";
   case PPCISD::Lo:              return "PPCISD::Lo";
   case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
@@ -785,7 +797,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::CALL_NOP_TLS:    return "PPCISD::CALL_NOP_TLS";
   case PPCISD::MTCTR:           return "PPCISD::MTCTR";
   case PPCISD::BCTRL:           return "PPCISD::BCTRL";
+  case PPCISD::BCTRL_LOAD_TOC:  return "PPCISD::BCTRL_LOAD_TOC";
   case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
+  case PPCISD::READ_TIME_BASE:  return "PPCISD::READ_TIME_BASE";
   case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
   case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
   case PPCISD::MFOCRF:          return "PPCISD::MFOCRF";
@@ -793,6 +807,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::VCMPo:           return "PPCISD::VCMPo";
   case PPCISD::LBRX:            return "PPCISD::LBRX";
   case PPCISD::STBRX:           return "PPCISD::STBRX";
+  case PPCISD::LFIWAX:          return "PPCISD::LFIWAX";
+  case PPCISD::LFIWZX:          return "PPCISD::LFIWZX";
   case PPCISD::LARX:            return "PPCISD::LARX";
   case PPCISD::STCX:            return "PPCISD::STCX";
   case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
@@ -827,6 +843,11 @@ EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   return VT.changeVectorElementTypeToInteger();
 }

+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+  return true;
+}
+
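An illustrative aside on the enableAggressiveFMAFusion hook added above (a minimal sketch, not part of the patch): answering true lets the combiner form fused multiply-adds wherever they are available, even though fusion changes rounding, since the fused form rounds once where the separate multiply and add round twice. A self-contained C++ demonstration of that numeric difference using the standard std::fma (the values are illustrative; C++17 hex-float literals):

#include <cmath>
#include <cstdio>

int main() {
  // (1+e)*(1-e) + (-1) with e = 2^-27: the exact product is 1 - 2^-54.
  double a = 1.0 + 0x1p-27, b = 1.0 - 0x1p-27, c = -1.0;
  double separate = a * b + c;      // product rounds to 1.0, so the sum is 0
  double fused = std::fma(a, b, c); // one rounding keeps -2^-54 exactly
  std::printf("separate=%a fused=%a\n", separate, fused);
  return 0;
}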
 //===----------------------------------------------------------------------===//
 // Node matching predicates, for use by the tblgen matching code.
 //===----------------------------------------------------------------------===//
@@ -858,20 +879,21 @@ static bool isConstantOrUndef(int Op, int Val) {
 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                SelectionDAG &DAG) {
+  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
   if (ShuffleKind == 0) {
-    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (IsLE)
       return false;
     for (unsigned i = 0; i != 16; ++i)
       if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
         return false;
   } else if (ShuffleKind == 2) {
-    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (!IsLE)
       return false;
     for (unsigned i = 0; i != 16; ++i)
       if (!isConstantOrUndef(N->getMaskElt(i), i*2))
         return false;
   } else if (ShuffleKind == 1) {
-    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
+    unsigned j = IsLE ? 0 : 1;
     for (unsigned i = 0; i != 8; ++i)
       if (!isConstantOrUndef(N->getMaskElt(i),   i*2+j) ||
           !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
@@ -888,22 +910,23 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                SelectionDAG &DAG) {
+  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
   if (ShuffleKind == 0) {
-    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (IsLE)
       return false;
     for (unsigned i = 0; i != 16; i += 2)
       if (!isConstantOrUndef(N->getMaskElt(i  ), i*2+2) ||
           !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
         return false;
   } else if (ShuffleKind == 2) {
-    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (!IsLE)
       return false;
     for (unsigned i = 0; i != 16; i += 2)
       if (!isConstantOrUndef(N->getMaskElt(i  ), i*2) ||
           !isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
         return false;
   } else if (ShuffleKind == 1) {
-    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 2;
+    unsigned j = IsLE ? 0 : 2;
     for (unsigned i = 0; i != 8; i += 2)
       if (!isConstantOrUndef(N->getMaskElt(i  ), i*2+j)   ||
           !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
@@ -942,7 +965,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
 /// the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
     if (ShuffleKind == 1) // unary
       return isVMerge(N, UnitSize, 0, 0);
     else if (ShuffleKind == 2) // swapped
@@ -967,7 +990,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
 /// the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
     if (ShuffleKind == 1) // unary
       return isVMerge(N, UnitSize, 8, 8);
     else if (ShuffleKind == 2) // swapped
@@ -1011,7 +1034,8 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
   if (ShiftAmt < i) return -1;
   ShiftAmt -= i;
-  bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian();
+  bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()->
+    isLittleEndian();

   if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
     // Check the rest of the elements to see if they are consecutive.
@@ -1084,7 +1108,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
                                 SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   assert(isSplatShuffleMask(SVOp, EltSize));
-  if (DAG.getTarget().getDataLayout()->isLittleEndian())
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian())
     return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
   else
     return SVOp->getMaskElt(0) / EltSize;
@@ -1881,7 +1905,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
   // gpr_index
   SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                     VAListPtr, MachinePointerInfo(SV), MVT::i8,
-                                    false, false, 0);
+                                    false, false, false, 0);
   InChain = GprIndex.getValue(1);

   if (VT == MVT::i64) {
@@ -1904,7 +1928,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
   // fpr
   SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                     FprPtr, MachinePointerInfo(SV), MVT::i8,
-                                    false, false, 0);
+                                    false, false, false, 0);
   InChain = FprIndex.getValue(1);

   SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
@@ -2318,7 +2342,8 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
 /// ensure minimum alignment required for target.
 static unsigned EnsureStackAlignment(const TargetMachine &Target,
                                      unsigned NumBytes) {
-  unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
+  unsigned TargetAlign =
+      Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
   unsigned AlignMask = TargetAlign - 1;
   NumBytes = (NumBytes + AlignMask) & ~AlignMask;
   return NumBytes;
@@ -2395,8 +2420,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());

   // Reserve space for the linkage area on the stack.
   unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false);
@@ -2470,7 +2495,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
   // caller's stack frame, right above the parameter list area.
   SmallVector<CCValAssign, 16> ByValArgLocs;
   CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      ByValArgLocs, *DAG.getContext());

   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2503,7 +2528,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
       PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
       PPC::F8
     };
-    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    if (DisablePPCFloatInVariadic)
+      NumFPArgRegs = 0;

     FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
                                                           NumGPArgRegs));
@@ -2512,7 +2539,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(

     // Make room for NumGPArgRegs and NumFPArgRegs.
     int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
-                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
+                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

     FuncInfo->setVarArgsStackOffset(
       MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
@@ -2554,7 +2581,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
                                    MachinePointerInfo(), false, false, 0);
       MemOps.push_back(Store);
       // Increment the address by eight for the next argument to store
-      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
+      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
                                        PtrVT);
       FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
     }
@@ -2703,7 +2730,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
       int FI;
       if (HasParameterArea ||
           ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
-        FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
+        FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true);
       else
         FI = MFI->CreateStackObject(ArgSize, Align, false);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
@@ -3069,7 +3096,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
         CurArgOffset = CurArgOffset + (4 - ObjSize);
       }
       // The value of the object is its address.
-      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false);
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
       InVals.push_back(FIN);
       if (ObjSize==1 || ObjSize==2) {
@@ -3547,9 +3574,24 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
   InFlag = Chain.getValue(1);
 }

+// Is this global address that of a function that can be called by name? (as
+// opposed to something that must hold a descriptor for an indirect call).
+static bool isFunctionGlobalAddress(SDValue Callee) {
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
+        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
+      return false;
+
+    return G->getGlobal()->getType()->getElementType()->isFunctionTy();
+  }
+
+  return false;
+}
+
 static
 unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
                      SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
+                     bool IsPatchPoint,
                      SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
                      SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
                      const PPCSubtarget &Subtarget) {
@@ -3572,34 +3614,31 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     needIndirectCall = false;
   }

-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
-    // Use indirect calls for ALL functions calls in JIT mode, since the
-    // far-call stubs may be outside relocation limits for a BL instruction.
-    if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
-      unsigned OpFlags = 0;
-      if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
-           (Subtarget.getTargetTriple().isMacOSX() &&
-            Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
-           (G->getGlobal()->isDeclaration() ||
-            G->getGlobal()->isWeakForLinker())) ||
-          (Subtarget.isTargetELF() && !isPPC64 &&
-           !G->getGlobal()->hasLocalLinkage() &&
-           DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
-        // PC-relative references to external symbols should go through $stub,
-        // unless we're building with the leopard linker or later, which
-        // automatically synthesizes these stubs.
-        OpFlags = PPCII::MO_PLT_OR_STUB;
-      }
-
-      // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
-      // every direct call is) turn it into a TargetGlobalAddress /
-      // TargetExternalSymbol node so that legalize doesn't hack it.
-      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
-                                          Callee.getValueType(),
-                                          0, OpFlags);
-      needIndirectCall = false;
+  if (isFunctionGlobalAddress(Callee)) {
+    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+    // A call to a TLS address is actually an indirect call to a
+    // thread-specific pointer.
+    unsigned OpFlags = 0;
+    if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+         (Subtarget.getTargetTriple().isMacOSX() &&
+          Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+         (G->getGlobal()->isDeclaration() ||
+          G->getGlobal()->isWeakForLinker())) ||
+        (Subtarget.isTargetELF() && !isPPC64 &&
+         !G->getGlobal()->hasLocalLinkage() &&
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = PPCII::MO_PLT_OR_STUB;
     }
+
+    // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+    // every direct call is) turn it into a TargetGlobalAddress /
+    // TargetExternalSymbol node so that legalize doesn't hack it.
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                        Callee.getValueType(), 0, OpFlags);
+    needIndirectCall = false;
   }

   if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
@@ -3609,7 +3648,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
         (Subtarget.getTargetTriple().isMacOSX() &&
          Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) ||
         (Subtarget.isTargetELF() && !isPPC64 &&
-         DAG.getTarget().getRelocationModel() == Reloc::PIC_) ) {
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
@@ -3621,6 +3660,16 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     needIndirectCall = false;
   }

+  if (IsPatchPoint) {
+    // We'll form an invalid direct call when lowering a patchpoint; the full
+    // sequence for an indirect call is complicated, and many of the
+    // instructions introduced might have side effects (and, thus, can't be
+    // removed later). The call itself will be removed as soon as the
+    // argument/return lowering is complete, so the fact that it has the wrong
+    // kind of operands should not really matter.
+    needIndirectCall = false;
+  }
+
   if (needIndirectCall) {
     // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
     // to do the call, we can't use PPCISD::CALL.
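An illustrative aside on the direct/indirect call split in PrepareCall above (a sketch from general ABI knowledge, not part of the patch): under the 64-bit ELFv1 ABI a function symbol names a descriptor rather than code, which is why an indirect call needs the MTCTR/BCTRL sequence plus a TOC save and restore around the call. Roughly, with field names chosen here for illustration:

// A 64-bit ELFv1 function descriptor (an ".opd" entry), approximately:
struct FunctionDescriptor {
  void *EntryPoint;  // address of the first instruction of the function
  void *TOCBase;     // the r2 (TOC pointer) value the callee expects
  void *Environment; // static chain, for languages that need one
};
// An indirect call loads EntryPoint into CTR and TOCBase into r2, issues
// bctrl, and afterwards reloads the caller's r2 from its TOC save slot.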
@@ -3746,7 +3795,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
                                   RegsToPass[i].second.getValueType()));

   // Direct calls in the ELFv2 ABI need the TOC register live into the call.
-  if (Callee.getNode() && isELFv2ABI)
+  if (Callee.getNode() && isELFv2ABI && !IsPatchPoint)
     Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));

   return CallOpc;
@@ -3769,8 +3818,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    SmallVectorImpl<SDValue> &InVals) const {

   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext());
   CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);

   // Copy all of the result registers out of their specified physreg.
@@ -3809,7 +3858,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,

 SDValue
 PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
-                              bool isTailCall, bool isVarArg,
+                              bool isTailCall, bool isVarArg, bool IsPatchPoint,
                               SelectionDAG &DAG,
                               SmallVector<std::pair<unsigned, SDValue>, 8>
                                 &RegsToPass,
@@ -3823,8 +3872,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
   std::vector<EVT> NodeTys;
   SmallVector<SDValue, 8> Ops;
   unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
-                                 isTailCall, RegsToPass, Ops, NodeTys,
-                                 Subtarget);
+                                 isTailCall, IsPatchPoint, RegsToPass, Ops,
+                                 NodeTys, Subtarget);

   // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
   if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
@@ -3838,7 +3887,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
                     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;

   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3867,8 +3917,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
   // stack frame. If caller and callee belong to the same module (and have the
   // same TOC), the NOP will remain unchanged.

-  bool needsTOCRestore = false;
-  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) {
+  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
+      !IsPatchPoint) {
     if (CallOpc == PPCISD::BCTRL) {
       // This is a call through a function pointer.
       // Restore the caller TOC from the save area into R2.
@@ -3879,7 +3929,17 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
       // since r2 is a reserved register (which prevents the register allocator
       // from allocating it), resulting in an additional register being
       // allocated and an unnecessary move instruction being generated.
-      needsTOCRestore = true;
+      CallOpc = PPCISD::BCTRL_LOAD_TOC;
+
+      EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+      SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+      unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
+      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
+      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+
+      // The address needs to go after the chain input but before the flag (or
+      // any other variadic arguments).
+      Ops.insert(std::next(Ops.begin()), AddTOC);
     } else if ((CallOpc == PPCISD::CALL) &&
                (!isLocalCall(Callee) ||
                 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
@@ -3893,17 +3953,6 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);

-  if (needsTOCRestore) {
-    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-    SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
-    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
-    SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
-    SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
-    Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                              DAG.getIntPtrConstant(BytesCalleePops, true),
                              InFlag, dl);
@@ -3927,6 +3976,7 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool &isTailCall                      = CLI.IsTailCall;
   CallingConv::ID CallConv              = CLI.CallConv;
   bool isVarArg                         = CLI.IsVarArg;
+  bool IsPatchPoint                     = CLI.IsPatchPoint;

   if (isTailCall)
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
@@ -3939,23 +3989,23 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (Subtarget.isSVR4ABI()) {
     if (Subtarget.isPPC64())
       return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
-                              isTailCall, Outs, OutVals, Ins,
+                              isTailCall, IsPatchPoint, Outs, OutVals, Ins,
                               dl, DAG, InVals);
     else
       return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
-                              isTailCall, Outs, OutVals, Ins,
+                              isTailCall, IsPatchPoint, Outs, OutVals, Ins,
                               dl, DAG, InVals);
   }

   return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
-                          isTailCall, Outs, OutVals, Ins,
+                          isTailCall, IsPatchPoint, Outs, OutVals, Ins,
                           dl, DAG, InVals);
 }

 SDValue
 PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
                                     CallingConv::ID CallConv, bool isVarArg,
-                                    bool isTailCall,
+                                    bool isTailCall, bool IsPatchPoint,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -3986,8 +4036,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,

   // Assign locations to all of the outgoing arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());

   // Reserve space for the linkage area on the stack.
   CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false),
@@ -4028,7 +4078,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,

   // Assign locations to all of the outgoing aggregate by value arguments.
   SmallVector<CCValAssign, 16> ByValArgLocs;
   CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      ByValArgLocs, *DAG.getContext());

   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -4165,7 +4215,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
     PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
                     false, TailCallArguments);

-  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
                     RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
                     Ins, InVals);
 }
@@ -4193,7 +4243,7 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
 SDValue
 PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                     CallingConv::ID CallConv, bool isVarArg,
-                                    bool isTailCall,
+                                    bool isTailCall, bool IsPatchPoint,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -4365,7 +4415,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
         if (GPR_idx != NumGPRs) {
           SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                         MachinePointerInfo(), VT,
-                                        false, false, 0);
+                                        false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
@@ -4629,9 +4679,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   // Check if this is an indirect call (MTCTR/BCTRL).
   // See PrepareCall() for more information about calls through function
   // pointers in the 64-bit SVR4 ABI.
-  if (!isTailCall &&
-      !dyn_cast<GlobalAddressSDNode>(Callee) &&
-      !dyn_cast<ExternalSymbolSDNode>(Callee)) {
+  if (!isTailCall && !IsPatchPoint &&
+      !isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee)) {
     // Load r2 into a virtual register and store it to the TOC save area.
     SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
     // TOC save area offset.
@@ -4643,7 +4693,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
     // This does not mean the MTCTR instruction must use R12; it's easier
     // to model this as an extra parameter, so do that.
-    if (isELFv2ABI)
+    if (isELFv2ABI && !IsPatchPoint)
       RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
   }

@@ -4660,7 +4710,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
                     FPOp, true, TailCallArguments);

-  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
                     RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
                     Ins, InVals);
 }
@@ -4668,7 +4718,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
 SDValue
 PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                                     CallingConv::ID CallConv, bool isVarArg,
-                                    bool isTailCall,
+                                    bool isTailCall, bool IsPatchPoint,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -4835,7 +4885,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
         if (GPR_idx != NumGPRs) {
           SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                         MachinePointerInfo(), VT,
-                                        false, false, 0);
+                                        false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
@@ -5034,8 +5084,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   // not mean the MTCTR instruction must use R12; it's easier to model this as
   // an extra parameter, so do that.
   if (!isTailCall &&
-      !dyn_cast<GlobalAddressSDNode>(Callee) &&
-      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee) &&
       !isBLACompatibleAddress(Callee, DAG))
     RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
                                                    PPC::R12), Callee));
@@ -5053,7 +5103,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
     PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
                     FPOp, true, TailCallArguments);

-  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
                     RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
                     Ins, InVals);
 }

@@ -5064,8 +5114,7 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_PPC);
 }

@@ -5077,8 +5126,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
                                SDLoc dl, SelectionDAG &DAG) const {

   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_PPC);

   SDValue Flag;
@@ -5168,7 +5217,7 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
     // Find out what the fix offset of the frame pointer save area.
     int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
     // Allocate the frame index for frame pointer save area.
-    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true);
+    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
     // Save the result.
     FI->setReturnAddrSaveIndex(RASI);
   }
@@ -5386,9 +5435,9 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }

-// FIXME: Split this code up when LegalizeDAGTypes lands.
-SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                                          SDLoc dl) const {
+void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+                                               SelectionDAG &DAG,
+                                               SDLoc dl) const {
   assert(Op.getOperand(0).getValueType().isFloatingPoint());
   SDValue Src = Op.getOperand(0);
   if (Src.getValueType() == MVT::f32)
@@ -5437,15 +5486,95 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   if (Op.getValueType() == MVT::i32 && !i32Stack) {
     FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                         DAG.getConstant(4, FIPtr.getValueType()));
-    MPI = MachinePointerInfo();
+    MPI = MPI.getWithOffset(4);
   }

-  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
-                     false, false, false, 0);
+  RLI.Chain = Chain;
+  RLI.Ptr = FIPtr;
+  RLI.MPI = MPI;
+}
+
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                          SDLoc dl) const {
+  ReuseLoadInfo RLI;
+  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+
+  return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
+                     false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
+                     RLI.Ranges);
+}
+
+// We're trying to insert a regular store, S, and then a load, L. If the
+// incoming value, O, is a load, we might just be able to have our load use the
+// address used by O. However, we don't know if anything else will store to
+// that address before we can load from it. To prevent this situation, we need
+// to insert our load, L, into the chain as a peer of O. To do this, we give L
+// the same chain operand as O, we create a token factor from the chain results
+// of O and L, and we replace all uses of O's chain result with that token
+// factor (see spliceIntoChain below for this last part).
+bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
+                                            ReuseLoadInfo &RLI,
+                                            SelectionDAG &DAG,
+                                            ISD::LoadExtType ET) const {
+  SDLoc dl(Op);
+  if (ET == ISD::NON_EXTLOAD &&
+      (Op.getOpcode() == ISD::FP_TO_UINT ||
+       Op.getOpcode() == ISD::FP_TO_SINT) &&
+      isOperationLegalOrCustom(Op.getOpcode(),
+                               Op.getOperand(0).getValueType())) {
+
+    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+    return true;
+  }
+
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
+  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
+      LD->isNonTemporal())
+    return false;
+  if (LD->getMemoryVT() != MemVT)
+    return false;
+
+  RLI.Ptr = LD->getBasePtr();
+  if (LD->isIndexed() && LD->getOffset().getOpcode() != ISD::UNDEF) {
+    assert(LD->getAddressingMode() == ISD::PRE_INC &&
+           "Non-pre-inc AM on PPC?");
+    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
+                          LD->getOffset());
+  }
+
+  RLI.Chain = LD->getChain();
+  RLI.MPI = LD->getPointerInfo();
+  RLI.IsInvariant = LD->isInvariant();
+  RLI.Alignment = LD->getAlignment();
+  RLI.AAInfo = LD->getAAInfo();
+  RLI.Ranges = LD->getRanges();
+
+  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
+  return true;
+}
+
+// Given the head of the old chain, ResChain, insert a token factor containing
+// it and NewResChain, and make users of ResChain now be users of that token
+// factor.
+void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
+                                        SDValue NewResChain,
+                                        SelectionDAG &DAG) const {
+  if (!ResChain)
+    return;
+
+  SDLoc dl(NewResChain);
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                           NewResChain, DAG.getUNDEF(MVT::Other));
+  assert(TF.getNode() != NewResChain.getNode() &&
+         "A new TF really is required here");
+
+  DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
+  DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
+}

 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
-                                           SelectionDAG &DAG) const {
+                                          SelectionDAG &DAG) const {
   SDLoc dl(Op);
   // Don't handle ppc_fp128 here; let it be lowered to a libcall.
   if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
@@ -5517,7 +5646,70 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
     SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
   }

-  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+  ReuseLoadInfo RLI;
+  SDValue Bits;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
+    Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
+                       false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
+                       RLI.Ranges);
+    spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+  } else if (Subtarget.hasLFIWAX() &&
+             canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                              RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+    Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
+                                   DAG.getVTList(MVT::f64, MVT::Other),
+                                   Ops, MVT::i32, MMO);
+    spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+  } else if (Subtarget.hasFPCVT() &&
+             canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                              RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+    Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
+                                   DAG.getVTList(MVT::f64, MVT::Other),
+                                   Ops, MVT::i32, MMO);
+    spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+  } else if (((Subtarget.hasLFIWAX() &&
+               SINT.getOpcode() == ISD::SIGN_EXTEND) ||
+              (Subtarget.hasFPCVT() &&
+               SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
+             SINT.getOperand(0).getValueType() == MVT::i32) {
+    MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+    int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    SDValue Store =
+      DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+                   MachinePointerInfo::getFixedStack(FrameIdx),
+                   false, false, 0);
+
+    assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+           "Expected an i32 store");
+
+    RLI.Ptr = FIdx;
+    RLI.Chain = Store;
+    RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+    RLI.Alignment = 4;
+
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                              RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+    Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
+                                   PPCISD::LFIWZX : PPCISD::LFIWAX,
+                                   dl, DAG.getVTList(MVT::f64, MVT::Other),
+                                   Ops, MVT::i32, MMO);
+  } else
+    Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+
   SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);

   if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
@@ -5538,23 +5730,36 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,

   SDValue Ld;
   if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
-    int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
-    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
-    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
-                                 MachinePointerInfo::getFixedStack(FrameIdx),
-                                 false, false, 0);
+    ReuseLoadInfo RLI;
+    bool ReusingLoad;
+    if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
+                                            DAG))) {
+      int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+      SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+                                   MachinePointerInfo::getFixedStack(FrameIdx),
+                                   false, false, 0);
+
+      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+             "Expected an i32 store");
+
+      RLI.Ptr = FIdx;
+      RLI.Chain = Store;
+      RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+      RLI.Alignment = 4;
+    }

-    assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
-           "Expected an i32 store");
     MachineMemOperand *MMO =
-      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
-                              MachineMemOperand::MOLoad, 4, 4);
-    SDValue Ops[] = { Store, FIdx };
+      MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                              RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
     Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
                                  PPCISD::LFIWZX : PPCISD::LFIWAX,
                                  dl, DAG.getVTList(MVT::f64, MVT::Other),
                                  Ops, MVT::i32, MMO);
+    if (ReusingLoad)
+      spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
   } else {
     assert(Subtarget.isPPC64() &&
            "i32->FP without LFIWAX supported only on PPC64");
@@ -6467,7 +6672,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
-                                                       SDLoc(Op));
+                                                      SDLoc(Op));
   case ISD::UINT_TO_FP:
   case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
@@ -6502,6 +6707,15 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::READCYCLECOUNTER: {
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
+
+    Results.push_back(RTB);
+    Results.push_back(RTB.getValue(1));
+    Results.push_back(RTB.getValue(2));
+    break;
+  }
   case ISD::INTRINSIC_W_CHAIN: {
     if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
         Intrinsic::ppc_is_decremented_ctr_nonzero)
@@ -6566,11 +6780,44 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
 // Other Lowering Code
 //===----------------------------------------------------------------------===//

+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Func = Intrinsic::getDeclaration(M, Id);
+  return Builder.CreateCall(Func);
+}
+
+// The mappings for emitLeading/TrailingFence is taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                                 AtomicOrdering Ord, bool IsStore,
+                                                 bool IsLoad) const {
+  if (Ord == SequentiallyConsistent)
+    return callIntrinsic(Builder, Intrinsic::ppc_sync);
+  else if (isAtLeastRelease(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  else
+    return nullptr;
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                                  AtomicOrdering Ord, bool IsStore,
+                                                  bool IsLoad) const {
+  if (IsLoad && isAtLeastAcquire(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  // FIXME: this is too conservative, a dependent branch + isync is enough.
+  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+  else
+    return nullptr;
+}
+
 MachineBasicBlock *
 PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                     bool is64bit, unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();

   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction *F = BB->getParent();
@@ -6593,9 +6840,8 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   MachineRegisterInfo &RegInfo = F->getRegInfo();
   unsigned TmpReg = (!BinOpcode) ? incr :
-    RegInfo.createVirtualRegister(
-       is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-                 (const TargetRegisterClass *) &PPC::GPRCRegClass);
+    RegInfo.createVirtualRegister( is64bit ? &PPC::G8RCRegClass
+                                           : &PPC::GPRCRegClass);

   // thisMBB:
   // ...
@@ -6632,7 +6878,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
            bool is8bit,    // operation
            unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   // In 64 bit mode we have to use 64 bits for addresses, even though the
   // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
   // registers without caring whether they're 32 or 64, but here we're
@@ -6660,9 +6907,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);

   MachineRegisterInfo &RegInfo = F->getRegInfo();
-  const TargetRegisterClass *RC =
-    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-              (const TargetRegisterClass *) &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                          : &PPC::GPRCRegClass;
   unsigned PtrReg = RegInfo.createVirtualRegister(RC);
   unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
   unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
@@ -6760,7 +7006,8 @@ llvm::MachineBasicBlock*
 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();

   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6859,7 +7106,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   // Setup
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
   const PPCRegisterInfo *TRI =
-    static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+    getTargetMachine().getSubtarget<PPCSubtarget>().getRegisterInfo();
   MIB.addRegMask(TRI->getNoPreservedMask());

   BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
@@ -6908,7 +7155,8 @@ MachineBasicBlock *
 PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();

   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -7012,6 +7260,10 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
 MachineBasicBlock *
 PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
+  if (MI->getOpcode() == TargetOpcode::STACKMAP ||
+      MI->getOpcode() == TargetOpcode::PATCHPOINT)
+    return emitPatchPoint(MI, BB);
+
   if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
       MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
     return emitEHSjLjSetJmp(MI, BB);
@@ -7020,7 +7272,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return emitEHSjLjLongJmp(MI, BB);
   }

-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();

   // To "insert" these instructions we actually have to insert their
   // control-flow patterns.
@@ -7043,7 +7296,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     Cond.push_back(MI->getOperand(1));

     DebugLoc dl = MI->getDebugLoc();
-    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    const TargetInstrInfo *TII =
+        getTargetMachine().getSubtargetImpl()->getInstrInfo();
     TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond,
                       MI->getOperand(2).getReg(), MI->getOperand(3).getReg());
@@ -7052,11 +7306,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
              MI->getOpcode() == PPC::SELECT_CC_F4 ||
              MI->getOpcode() == PPC::SELECT_CC_F8 ||
              MI->getOpcode() == PPC::SELECT_CC_VRRC ||
+             MI->getOpcode() == PPC::SELECT_CC_VSFRC ||
+             MI->getOpcode() == PPC::SELECT_CC_VSRC ||
              MI->getOpcode() == PPC::SELECT_I4 ||
              MI->getOpcode() == PPC::SELECT_I8 ||
              MI->getOpcode() == PPC::SELECT_F4 ||
              MI->getOpcode() == PPC::SELECT_F8 ||
-             MI->getOpcode() == PPC::SELECT_VRRC) {
+             MI->getOpcode() == PPC::SELECT_VRRC ||
+             MI->getOpcode() == PPC::SELECT_VSFRC ||
+             MI->getOpcode() == PPC::SELECT_VSRC) {
     // The incoming instruction knows the destination vreg to set, the
     // condition code register to branch on, the true/false values to
     // select between, and a branch opcode to use.
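An illustrative aside on the SELECT/SELECT_CC expansion handled above (a minimal sketch, not part of the patch): the custom inserter builds a small triangle of blocks, thisMBB, copy0MBB, and sinkMBB, with a PHI in sinkMBB merging the value arriving from each predecessor. In plain C++ terms the emitted control flow is roughly:

int select(bool cond, int trueVal, int falseVal) {
  int result = trueVal;  // the edge thisMBB -> sinkMBB carries the true value
  if (!cond)
    result = falseVal;   // the edge copy0MBB -> sinkMBB carries the false value
  return result;         // the PHI in sinkMBB merges the two incoming values
}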
@@ -7087,7 +7345,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                MI->getOpcode() == PPC::SELECT_I8 ||
                MI->getOpcode() == PPC::SELECT_F4 ||
                MI->getOpcode() == PPC::SELECT_F8 ||
-               MI->getOpcode() == PPC::SELECT_VRRC) {
+               MI->getOpcode() == PPC::SELECT_VRRC ||
+               MI->getOpcode() == PPC::SELECT_VSFRC ||
+               MI->getOpcode() == PPC::SELECT_VSRC) {
       BuildMI(BB, dl, TII->get(PPC::BC))
         .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
     } else {
@@ -7112,6 +7372,51 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
             TII->get(PPC::PHI), MI->getOperand(0).getReg())
       .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
       .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+  } else if (MI->getOpcode() == PPC::ReadTB) {
+    // To read the 64-bit time-base register on a 32-bit target, we read the
+    // two halves. Should the counter have wrapped while it was being read, we
+    // need to try again.
+    // ...
+    // readLoop:
+    // mfspr Rx,TBU # load from TBU
+    // mfspr Ry,TB # load from TB
+    // mfspr Rz,TBU # load from TBU
+    // cmpw crX,Rx,Rz # check if 'old'='new'
+    // bne readLoop # branch if they're not equal
+    // ...
+
+    MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    DebugLoc dl = MI->getDebugLoc();
+    F->insert(It, readMBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(readMBB);
+    BB = readMBB;
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+    unsigned LoReg = MI->getOperand(0).getReg();
+    unsigned HiReg = MI->getOperand(1).getReg();
+
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
+
+    unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+
+    BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
+      .addReg(HiReg).addReg(ReadAgainReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB);
+
+    BB->addSuccessor(readMBB);
+    BB->addSuccessor(sinkMBB);
   } else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
     BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
@@ -7270,9 +7575,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);

   MachineRegisterInfo &RegInfo = F->getRegInfo();
-  const TargetRegisterClass *RC =
-    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-              (const TargetRegisterClass *) &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                          : &PPC::GPRCRegClass;
   unsigned PtrReg = RegInfo.createVirtualRegister(RC);
   unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
   unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
@@ -7448,151 +7752,76 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 // Target Optimization Hooks
 //===----------------------------------------------------------------------===//

-SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
-                                               DAGCombinerInfo &DCI) const {
-  if (DCI.isAfterLegalizeVectorOps())
-    return SDValue();
-
-  EVT VT = Op.getValueType();
-
-  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRE()) ||
+SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
       (VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
-    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-    // For the reciprocal, we need to find the zero of the function:
-    //   F(X) = A X - 1 [which has a zero at X = 1/A]
-    //     =>
-    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
-    //     does not require additional intermediate precision]
     // Convergence is quadratic, so we essentially double the number of digits
-    // correct after every iteration. The minimum architected relative
-    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
-    // 23 digits and double has 52 digits.
-    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    // correct after every iteration. For both FRE and FRSQRTE, the minimum
+    // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+    // 2^-14. IEEE float has 23 digits and double has 52 digits.
+    RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
     if (VT.getScalarType() == MVT::f64)
-      ++Iterations;
-
-    SelectionDAG &DAG = DCI.DAG;
-    SDLoc dl(Op);
-
-    SDValue FPOne =
-      DAG.getConstantFP(1.0, VT.getScalarType());
-    if (VT.isVector()) {
-      assert(VT.getVectorNumElements() == 4 &&
-             "Unknown vector type");
-      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                          FPOne, FPOne, FPOne, FPOne);
-    }
-
-    SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
-    DCI.AddToWorklist(Est.getNode());
-
-    // Newton iterations: Est = Est + Est (1 - Arg * Est)
-    for (int i = 0; i < Iterations; ++i) {
-      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(Est.getNode());
-    }
-
-    return Est;
+      ++RefinementSteps;
+    UseOneConstNR = true;
+    return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
   }
-
   return SDValue();
 }

-SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
-                                                    DAGCombinerInfo &DCI) const {
-  if (DCI.isAfterLegalizeVectorOps())
-    return SDValue();
-
-  EVT VT = Op.getValueType();
-
-  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
       (VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
-    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-    // For the reciprocal sqrt, we need to find the zero of the function:
-    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
-    //     =>
-    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
-    // As a result, we precompute A/2 prior to the iteration loop.
    // Convergence is quadratic, so we essentially double the number of digits
-    // correct after every iteration. The minimum architected relative
-    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
-    // 23 digits and double has 52 digits.
-    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    // correct after every iteration. For both FRE and FRSQRTE, the minimum
+    // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+    // 2^-14. IEEE float has 23 digits and double has 52 digits.
+    RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
     if (VT.getScalarType() == MVT::f64)
-      ++Iterations;
-
-    SelectionDAG &DAG = DCI.DAG;
-    SDLoc dl(Op);
-
-    SDValue FPThreeHalves =
-      DAG.getConstantFP(1.5, VT.getScalarType());
-    if (VT.isVector()) {
-      assert(VT.getVectorNumElements() == 4 &&
-             "Unknown vector type");
-      FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                                  FPThreeHalves, FPThreeHalves,
-                                  FPThreeHalves, FPThreeHalves);
-    }
-
-    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
-    DCI.AddToWorklist(Est.getNode());
-
-    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
-    // this entire sequence requires only one FP constant.
-    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
-    DCI.AddToWorklist(HalfArg.getNode());
-
-    HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
-    DCI.AddToWorklist(HalfArg.getNode());
-
-    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
-    for (int i = 0; i < Iterations; ++i) {
-      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
+      ++RefinementSteps;
+    return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
+  }
+  return SDValue();
+}
 
-      Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(Est.getNode());
-    }
+
+bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+  // Note: This functionality is used only when unsafe-fp-math is enabled, and
+  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+  // enabled for division), this functionality is redundant with the default
+  // combiner logic (once the division -> reciprocal/multiply transformation
+  // has taken place). As a result, this matters more for older cores than for
+  // newer ones.
 
-    return Est;
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are two or more FDIVs (for embedded cores with only
+  // one FP pipeline) or three or more FDIVs (for generic OOO cores).
+  switch (Subtarget.getDarwinDirective()) {
+  default:
+    return NumUsers > 2;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    return NumUsers > 1;
   }
-
-  return SDValue();
 }
 
-// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
-// not enforce equality of the chain operands.
-static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
                             unsigned Bytes, int Dist,
                             SelectionDAG &DAG) {
-  EVT VT = LS->getMemoryVT();
   if (VT.getSizeInBits() / 8 != Bytes)
     return false;
 
-  SDValue Loc = LS->getBasePtr();
   SDValue BaseLoc = Base->getBasePtr();
   if (Loc.getOpcode() == ISD::FrameIndex) {
     if (BaseLoc.getOpcode() != ISD::FrameIndex)
@@ -7623,11 +7852,77 @@ static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
   return false;
 }
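isConsecutiveLSLoc above factors the location test out of the old isConsecutiveLS so it can be shared with the intrinsic-based loads and stores handled next. Stripped of the DAG details (FrameIndex slots, base-plus-constant address chains, global addresses), the decision it makes reduces to a base-and-offset comparison; a simplified model with hypothetical types, not the DAG helper itself:

#include <stdint.h>

struct MemLoc {
  const void *Base;   // shared base: frame slot, global, or pointer value
  int64_t Offset;     // byte offset from that base
};

// Loc is consecutive with BaseLoc at distance Dist (which may be negative)
// when both share a base and the offsets differ by exactly Dist * Bytes.
static bool isConsecutive(const MemLoc &Loc, const MemLoc &BaseLoc,
                          unsigned Bytes, int Dist) {
  return Loc.Base == BaseLoc.Base &&
         Loc.Offset == BaseLoc.Offset + (int64_t)Dist * (int64_t)Bytes;
}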
+
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
+// not enforce equality of the chain operands.
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
+    EVT VT = LS->getMemoryVT();
+    SDValue Loc = LS->getBasePtr();
+    return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
+  }
+
+  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_altivec_lvx:
+    case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
+  }
+
+  if (N->getOpcode() == ISD::INTRINSIC_VOID) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
+  }
+
+  return false;
+}
+
 // Return true if there is a nearby consecutive load to the one provided
 // (regardless of alignment). We search up and down the chain, looking through
-// token factors and other loads (but nothing else). As a result, a true
-// results indicates that it is safe to create a new consecutive load adjacent
-// to the load provided.
+// token factors and other loads (but nothing else). As a result, a true result
+// indicates that it is safe to create a new consecutive load adjacent to the
+// load provided.
 static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
   SDValue Chain = LD->getChain();
   EVT VT = LD->getMemoryVT();
@@ -7641,10 +7936,10 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
   // nodes just above the top-level loads and token factors.
   while (!Queue.empty()) {
     SDNode *ChainNext = Queue.pop_back_val();
-    if (!Visited.insert(ChainNext))
+    if (!Visited.insert(ChainNext).second)
       continue;
 
-    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
       if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
         return true;
 
@@ -7672,17 +7967,17 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
   while (!Queue.empty()) {
     SDNode *LoadRoot = Queue.pop_back_val();
-    if (!Visited.insert(LoadRoot))
+    if (!Visited.insert(LoadRoot).second)
       continue;
 
-    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
       if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
         return true;
 
     for (SDNode::use_iterator UI = LoadRoot->use_begin(),
          UE = LoadRoot->use_end(); UI != UE; ++UI)
-      if (((isa<LoadSDNode>(*UI) &&
-            cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+      if (((isa<MemSDNode>(*UI) &&
+            cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
           UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
         Queue.push_back(*UI);
   }
@@ -7802,7 +8097,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
     SDValue BinOp = BinOps.back();
     BinOps.pop_back();
 
-    if (!Visited.insert(BinOp.getNode()))
+    if (!Visited.insert(BinOp.getNode()).second)
       continue;
 
     PromOps.push_back(BinOp);
@@ -8016,7 +8311,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
     SDValue BinOp = BinOps.back();
     BinOps.pop_back();
 
-    if (!Visited.insert(BinOp.getNode()))
+    if (!Visited.insert(BinOp.getNode()).second)
      continue;
 
     PromOps.push_back(BinOp);
@@ -8045,6 +8340,10 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
     }
   }
 
+  // The operands of a select that must be truncated when the select is
+  // promoted because the operand is actually part of the to-be-promoted set.
+  DenseMap<SDNode *, EVT> SelectTruncOp[2];
+
   // Make sure that this is a self-contained cluster of operations (which
   // is not quite the same thing as saying that everything has only one
   // use).
@@ -8059,18 +8358,19 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       if (User != N && !Visited.count(User))
         return SDValue();
 
-      // Make sure that we're not going to promote the non-output-value
-      // operand(s) or SELECT or SELECT_CC.
-      // FIXME: Although we could sometimes handle this, and it does occur in
-      // practice that one of the condition inputs to the select is also one of
-      // the outputs, we currently can't deal with this.
+      // If we're going to promote the non-output-value operand(s) or SELECT or
+      // SELECT_CC, record them for truncation.
       if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
-          return SDValue();
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                  User->getOperand(0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
-        if (User->getOperand(0) == Inputs[i] ||
-            User->getOperand(1) == Inputs[i])
-          return SDValue();
+        if (User->getOperand(0) == Inputs[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                  User->getOperand(0).getValueType()));
+        if (User->getOperand(1) == Inputs[i])
+          SelectTruncOp[1].insert(std::make_pair(User,
+                                  User->getOperand(1).getValueType()));
       }
     }
   }
 
@@ -8083,18 +8383,19 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       if (User != N && !Visited.count(User))
         return SDValue();
 
-      // Make sure that we're not going to promote the non-output-value
-      // operand(s) or SELECT or SELECT_CC.
-      // FIXME: Although we could sometimes handle this, and it does occur in
-      // practice that one of the condition inputs to the select is also one of
-      // the outputs, we currently can't deal with this.
+      // If we're going to promote the non-output-value operand(s) or SELECT or
+      // SELECT_CC, record them for truncation.
       if (User->getOpcode() == ISD::SELECT) {
         if (User->getOperand(0) == PromOps[i])
-          return SDValue();
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                  User->getOperand(0).getValueType()));
       } else if (User->getOpcode() == ISD::SELECT_CC) {
-        if (User->getOperand(0) == PromOps[i] ||
-            User->getOperand(1) == PromOps[i])
-          return SDValue();
+        if (User->getOperand(0) == PromOps[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                  User->getOperand(0).getValueType()));
+        if (User->getOperand(1) == PromOps[i])
+          SelectTruncOp[1].insert(std::make_pair(User,
+                                  User->getOperand(1).getValueType()));
       }
     }
   }
 
@@ -8175,6 +8476,19 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       continue;
     }
 
+    // For SELECT and SELECT_CC nodes, we do a similar check for any
+    // to-be-promoted comparison inputs.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
+           PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
+          (SelectTruncOp[1].count(PromOp.getNode()) &&
+           PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
+        PromOps.insert(PromOps.begin(), PromOp);
+        continue;
+      }
+    }
+
     SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                 PromOp.getNode()->op_end());
 
@@ -8193,6 +8507,18 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
         Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
     }
 
+    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
+    // truncate them again to the original value type.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
+      if (SI0 != SelectTruncOp[0].end())
+        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
+      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
+      if (SI1 != SelectTruncOp[1].end())
+        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
+    }
+
     DAG.ReplaceAllUsesOfValueWith(PromOp,
       DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
   }
@@ -8219,6 +8545,174 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                        N->getOperand(0), ShiftCst), ShiftCst);
 }
 
+SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::SINT_TO_FP ||
+          N->getOpcode() == ISD::UINT_TO_FP) &&
+         "Need an int -> FP conversion node here");
+
+  if (!Subtarget.has64BitSupport())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Op(N, 0);
+
+  // Don't handle ppc_fp128 here or i1 conversions.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return SDValue();
+
+  // For i32 intermediate values, unfortunately, the conversion functions
+  // leave the upper 32 bits of the value undefined. Within the set of
+  // scalar instructions, we have no method for zero- or sign-extending the
+  // value. Thus, we cannot handle i32 intermediate values here.
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDUS : PPCISD::FCFIDS) :
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDU : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+              MVT::f32 : MVT::f64;
+
+  // If we're converting from a float, to an int, and back to a float again,
+  // then we don't need the store/load pair at all.
+  if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
+       Subtarget.hasFPCVT()) ||
+      (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
+    SDValue Src = Op.getOperand(0).getOperand(0);
+    if (Src.getValueType() == MVT::f32) {
+      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+      DCI.AddToWorklist(Src.getNode());
+    }
+
+    unsigned FCTOp =
+      Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ;
+
+    SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
+
+    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0));
+      DCI.AddToWorklist(FP.getNode());
+    }
+
+    return FP;
+  }
+
+  return SDValue();
+}
+
+// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
+// builtins) into loads with swaps.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX load");
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    Chain = LD->getChain();
+    Base = LD->getBasePtr();
+    MMO = LD->getMemOperand();
+    // If the MMO suggests this isn't a load of a full vector, leave
+    // things alone. For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    Base = Intrin->getBasePtr();
+    MMO = Intrin->getMemOperand();
+    break;
+  }
+  }
+
+  MVT VecTy = N->getValueType(0).getSimpleVT();
+  SDValue LoadOps[] = { Chain, Base };
+  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+                                         DAG.getVTList(VecTy, MVT::Other),
+                                         LoadOps, VecTy, MMO);
+  DCI.AddToWorklist(Load.getNode());
+  Chain = Load.getValue(1);
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Load);
+  DCI.AddToWorklist(Swap.getNode());
+  return Swap;
+}
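expandVSXLoadForLE pairs PPCISD::LXVD2X with PPCISD::XXSWAPD: lxvd2x always places the doubleword from the lower address in the big-endian-first half of the register, so on a little-endian target the two elements arrive swapped, and xxswapd exchanges them back. A scalar model of the v2f64 case (illustrative only, hypothetical helper names, not the DAG nodes):

// VSR modeled as two doublewords in big-endian order: VSR[0] is the
// BE-first (leftmost) doubleword. On a little-endian target, vector
// element i of a v2f64 lives in VSR[1 - i].
static void lxvd2x(const double Mem[2], double VSR[2]) {
  VSR[0] = Mem[0];               // lower address -> BE-first doubleword
  VSR[1] = Mem[1];
}
static void xxswapd(double VSR[2]) {
  double T = VSR[0]; VSR[0] = VSR[1]; VSR[1] = T;
}
// The pair yields a little-endian-order load: afterwards element 0
// (Mem[0]) sits in VSR[1], exactly where LE element numbering expects it.
static void loadV2F64LE(const double Mem[2], double VSR[2]) {
  lxvd2x(Mem, VSR);
  xxswapd(VSR);
}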
+
+// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
+// builtins) into stores with swaps.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  unsigned SrcOpnd;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX store");
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    Chain = ST->getChain();
+    Base = ST->getBasePtr();
+    MMO = ST->getMemOperand();
+    SrcOpnd = 1;
+    // If the MMO suggests this isn't a store of a full vector, leave
+    // things alone. For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    // Intrin->getBasePtr() oddly does not get what we want.
+    Base = Intrin->getOperand(3);
+    MMO = Intrin->getMemOperand();
+    SrcOpnd = 2;
+    break;
+  }
+  }
+
+  SDValue Src = N->getOperand(SrcOpnd);
+  MVT VecTy = Src.getValueType().getSimpleVT();
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Src);
+  DCI.AddToWorklist(Swap.getNode());
+  Chain = Swap.getValue(1);
+  SDValue StoreOps[] = { Chain, Swap, Base };
+  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
+                                          DAG.getVTList(MVT::Other),
+                                          StoreOps, VecTy, MMO);
+  DCI.AddToWorklist(Store.getNode());
+  return Store;
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   const TargetMachine &TM = getTargetMachine();
@@ -8253,124 +8747,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SETCC:
   case ISD::SELECT_CC:
     return DAGCombineTruncBoolExt(N, DCI);
-  case ISD::FDIV: {
-    assert(TM.Options.UnsafeFPMath &&
-           "Reciprocal estimates require UnsafeFPMath");
-
-    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
-               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
-                                 DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
-                         N->getValueType(0), RV);
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
-               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
-                                 DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
-                         N->getValueType(0), RV,
-                         N->getOperand(1).getOperand(1));
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    }
-
-    SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
-    if (RV.getNode()) {
-      DCI.AddToWorklist(RV.getNode());
-      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                         N->getOperand(0), RV);
-    }
-
-  }
-  break;
-  case ISD::FSQRT: {
-    assert(TM.Options.UnsafeFPMath &&
-           "Reciprocal estimates require UnsafeFPMath");
-
-    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
-    // reciprocal sqrt.
-    SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
-    if (RV.getNode()) {
-      DCI.AddToWorklist(RV.getNode());
-      RV = DAGCombineFastRecip(RV, DCI);
-      if (RV.getNode()) {
-        // Unfortunately, RV is now NaN if the input was exactly 0. Select out
-        // this case and force the answer to 0.
-
-        EVT VT = RV.getValueType();
-
-        SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
-        if (VT.isVector()) {
-          assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
-          Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
-        }
-
-        SDValue ZeroCmp =
-          DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
-                       N->getOperand(0), Zero, ISD::SETEQ);
-        DCI.AddToWorklist(ZeroCmp.getNode());
-        DCI.AddToWorklist(RV.getNode());
-
-        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
-                         ZeroCmp, Zero, RV);
-        return RV;
-      }
-    }
-
-  }
-  break;
   case ISD::SINT_TO_FP:
-    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
-      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
-        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
-        // We allow the src/dst to be either f32/f64, but the intermediate
-        // type must be i64.
-        if (N->getOperand(0).getValueType() == MVT::i64 &&
-            N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
-          SDValue Val = N->getOperand(0).getOperand(0);
-          if (Val.getValueType() == MVT::f32) {
-            Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
-            DCI.AddToWorklist(Val.getNode());
-          }
-
-          Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
-          DCI.AddToWorklist(Val.getNode());
-          Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
-          DCI.AddToWorklist(Val.getNode());
-          if (N->getValueType(0) == MVT::f32) {
-            Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
-                              DAG.getIntPtrConstant(0));
-            DCI.AddToWorklist(Val.getNode());
-          }
-          return Val;
-        } else if (N->getOperand(0).getValueType() == MVT::i32) {
-          // If the intermediate type is i32, we can avoid the load/store here
-          // too.
-        }
-      }
-    }
-    break;
-  case ISD::STORE:
+  case ISD::UINT_TO_FP:
+    return combineFPToIntToFP(N, DCI);
+  case ISD::STORE: {
     // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
     if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
         !cast<StoreSDNode>(N)->isTruncatingStore() &&
@@ -8421,14 +8801,39 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                               Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                               cast<StoreSDNode>(N)->getMemOperand());
     }
+
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    EVT VT = N->getOperand(1).getValueType();
+    if (VT.isSimple()) {
+      MVT StoreVT = VT.getSimpleVT();
+      if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+          TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+        return expandVSXStoreForLE(N, DCI);
+    }
     break;
+  }
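The store path is the mirror image of the load case: expandVSXStoreForLE runs the source vector through xxswapd first, so that stxvd2x, which always writes the big-endian-first doubleword to the lower address, produces little-endian element order in memory. Continuing the scalar model from the load sketch above:

static void stxvd2x(const double VSR[2], double Mem[2]) {
  Mem[0] = VSR[0];               // BE-first doubleword -> lower address
  Mem[1] = VSR[1];
}
// Swapping first moves LE element 0 (held in VSR[1]) into VSR[0], so
// stxvd2x writes it to the lower address, as LE layout requires.
static void storeV2F64LE(double VSR[2], double Mem[2]) {
  xxswapd(VSR);                  // from the load sketch above
  stxvd2x(VSR, Mem);
}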
   case ISD::LOAD: {
     LoadSDNode *LD = cast<LoadSDNode>(N);
     EVT VT = LD->getValueType(0);
+
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (VT.isSimple()) {
+      MVT LoadVT = VT.getSimpleVT();
+      if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+          TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+        return expandVSXLoadForLE(N, DCI);
+    }
+
     Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
     unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
     if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
         TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+        // P8 and later hardware should just use LOAD.
+        !TM.getSubtarget<PPCSubtarget>().hasP8Vector() &&
         (VT == MVT::v16i8 || VT == MVT::v8i16 ||
          VT == MVT::v4i32 || VT == MVT::v4f32) &&
         LD->getAlignment() < ABIAlignment) {
@@ -8466,17 +8871,25 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                  Intrinsic::ppc_altivec_lvsl);
     SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
 
-    // Refine the alignment of the original load (a "new" load created here
-    // which was identical to the first except for the alignment would be
-    // merged with the existing node regardless).
+    // Create the new MMO for the new base load. It is like the original MMO,
+    // but represents an area in memory almost twice the vector size centered
+    // on the original address. If the address is unaligned, we might start
+    // reading up to (sizeof(vector)-1) bytes below the address of the
+    // original unaligned load.
     MachineFunction &MF = DAG.getMachineFunction();
-    MachineMemOperand *MMO =
-      MF.getMachineMemOperand(LD->getPointerInfo(),
-                              LD->getMemOperand()->getFlags(),
-                              LD->getMemoryVT().getStoreSize(),
-                              ABIAlignment);
-    LD->refineAlignment(MMO);
-    SDValue BaseLoad = SDValue(LD, 0);
+    MachineMemOperand *BaseMMO =
+      MF.getMachineMemOperand(LD->getMemOperand(),
+                              -LD->getMemoryVT().getStoreSize()+1,
+                              2*LD->getMemoryVT().getStoreSize()-1);
+
+    // Create the new base load.
+    SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx,
+                                             getPointerTy());
+    SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
+    SDValue BaseLoad =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                              DAG.getVTList(MVT::v4i32, MVT::Other),
+                              BaseLoadOps, MVT::v4i32, BaseMMO);
 
     // Note that the value of IncOffset (which is provided to the next
     // load's pointer info offset value, and thus used to calculate the
@@ -8498,21 +8911,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
 
+    MachineMemOperand *ExtraMMO =
+      MF.getMachineMemOperand(LD->getMemOperand(),
+                              1, 2*LD->getMemoryVT().getStoreSize()-1);
+    SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
     SDValue ExtraLoad =
-      DAG.getLoad(VT, dl, Chain, Ptr,
-                  LD->getPointerInfo().getWithOffset(IncOffset),
-                  LD->isVolatile(), LD->isNonTemporal(),
-                  LD->isInvariant(), ABIAlignment);
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                              DAG.getVTList(MVT::v4i32, MVT::Other),
+                              ExtraLoadOps, MVT::v4i32, ExtraMMO);
 
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              BaseLoad.getValue(1), ExtraLoad.getValue(1));
 
-    if (BaseLoad.getValueType() != MVT::v4i32)
-      BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
-
-    if (ExtraLoad.getValueType() != MVT::v4i32)
-      ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
-
     // Because vperm has a big-endian bias, we must reverse the order
     // of the input vectors and complement the permute control vector
     // when generating little endian code.  We have already handled the
@@ -8529,36 +8939,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (VT != MVT::v4i32)
       Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
 
-    // Now we need to be really careful about how we update the users of the
-    // original load. We cannot just call DCI.CombineTo (or
-    // DAG.ReplaceAllUsesWith for that matter), because the load still has
-    // uses created here (the permutation for example) that need to stay.
-    SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
-    while (UI != UE) {
-      SDUse &Use = UI.getUse();
-      SDNode *User = *UI;
-      // Note: BaseLoad is checked here because it might not be N, but a
-      // bitcast of N.
-      if (User == Perm.getNode() || User == BaseLoad.getNode() ||
-          User == TF.getNode() || Use.getResNo() > 1) {
-        ++UI;
-        continue;
-      }
-
-      SDValue To = Use.getResNo() ? TF : Perm;
-      ++UI;
-
-      SmallVector<SDValue, 8> Ops;
-      for (const SDUse &O : User->ops()) {
-        if (O == Use)
-          Ops.push_back(To);
-        else
-          Ops.push_back(O);
-      }
-
-      DAG.UpdateNodeOperands(User, Ops);
-    }
-
+    // The output of the permutation is our loaded result, the TokenFactor is
+    // our new chain.
+    DCI.CombineTo(N, Perm, TF);
     return SDValue(N, 0);
   }
 }
@@ -8593,6 +8976,34 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     }
     break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+        TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_lxvw4x:
+      case Intrinsic::ppc_vsx_lxvd2x:
+        return expandVSXLoadForLE(N, DCI);
+      }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+        TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_stxvw4x:
+      case Intrinsic::ppc_vsx_stxvd2x:
+        return expandVSXStoreForLE(N, DCI);
+      }
+    }
+    break;
+  }
   case ISD::BSWAP:
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
     if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
@@ -8803,6 +9214,38 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
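The reworked ISD::LOAD case above replaces the plain unaligned vector load with two aligned ppc_altivec_lvx loads, whose memory operands now honestly cover the 2*size-1 byte window the hardware may touch, combined by vperm; DCI.CombineTo then wires the permuted result and TokenFactor back into the graph. A rough scalar model of the classic lvx/lvsl/vperm idiom (hypothetical helper, big-endian lvsl semantics; as the comment in the code notes, the little-endian path additionally reverses the inputs and complements the control vector):

#include <stdint.h>

static void unalignedLoad16(const uint8_t *Addr, uint8_t Out[16]) {
  const uint8_t *Lo = (const uint8_t *)((uintptr_t)Addr & ~(uintptr_t)15);
  const uint8_t *Hi = Lo + 16;            // the "extra" aligned load
  unsigned Shift = (uintptr_t)Addr & 15;  // what lvsl encodes in the control
  for (unsigned i = 0; i != 16; ++i)      // vperm: take 16 consecutive bytes
    Out[i] = (Shift + i < 16) ? Lo[Shift + i] : Hi[Shift + i - 16];
}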
+SDValue
+PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                 SelectionDAG &DAG,
+                                 std::vector<SDNode *> *Created) const {
+  // fold (sdiv X, pow2)
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i64 && !Subtarget.isPPC64())
+    return SDValue();
+  if ((VT != MVT::i32 && VT != MVT::i64) ||
+      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+
+  bool IsNegPow2 = (-Divisor).isPowerOf2();
+  unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
+  SDValue ShiftAmt = DAG.getConstant(Lg2, VT);
+
+  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
+  if (Created)
+    Created->push_back(Op.getNode());
+
+  if (IsNegPow2) {
+    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), Op);
+    if (Created)
+      Created->push_back(Op.getNode());
+  }
+
+  return Op;
+}
+
 //===----------------------------------------------------------------------===//
 // Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
@@ -8844,6 +9287,40 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   }
 }
 
+unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8: {
+    if (!ML)
+      break;
+
+    const PPCInstrInfo *TII =
+      static_cast<const PPCInstrInfo *>(getTargetMachine().getSubtargetImpl()->
+                                          getInstrInfo());
+
+    // For small loops (between 5 and 8 instructions), align to a 32-byte
+    // boundary so that the entire loop fits in one instruction-cache line.
+    uint64_t LoopSize = 0;
+    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
+      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J)
+        LoopSize += TII->GetInstSizeInBytes(J);
+
+    if (LoopSize > 16 && LoopSize <= 32)
+      return 5;
+
+    break;
+  }
+  }
+
+  return TargetLowering::getPrefLoopAlignment(ML);
+}
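getPrefLoopAlignment returns a log2 value, so the "return 5" above requests 1 << 5 = 32-byte alignment. Since PowerPC instructions are a fixed 4 bytes, the guarded range corresponds to loops of 5 to 8 instructions; a sketch of the size test with a hypothetical helper name:

#include <stdint.h>

// A loop of 20..32 bytes placed on a 32-byte boundary occupies exactly one
// 32-byte instruction-fetch block; anything at or under 16 bytes is left
// alone because the default alignment already keeps it within one block.
static bool wantsOneFetchBlock(uint64_t LoopSizeInBytes) {
  return LoopSizeInBytes > 16 && LoopSizeInBytes <= 32; // 5..8 instructions
}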
 
 /// getConstraintType - Given a constraint, return the type of
 /// constraint it is for this target.
@@ -8976,7 +9453,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   // the AsmName field from *RegisterInfo.td, then this would not be necessary.
   if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
       PPC::GPRCRegClass.contains(R.first)) {
-    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI =
+        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
     return std::make_pair(TRI->getMatchingSuperReg(R.first,
                           PPC::sub_32, &PPC::G8RCRegClass),
                           &PPC::G8RCRegClass);
@@ -9200,6 +9678,92 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   return false;
 }
 
+bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+
+  switch (Intrinsic) {
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::ppc_altivec_lvebx:
+  case Intrinsic::ppc_altivec_lvehx:
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = -VT.getStoreSize()+1;
+    Info.size = 2*VT.getStoreSize()-1;
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+  case Intrinsic::ppc_altivec_stvebx:
+  case Intrinsic::ppc_altivec_stvehx:
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = -VT.getStoreSize()+1;
+    Info.size = 2*VT.getStoreSize()-1;
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
 /// getOptimalMemOpType - Returns the target specific optimal type for load
 /// and store operations as a result of memset, memcpy, and memmove
 /// lowering. If DstAlign is zero that means it's safe to destination
@@ -9251,6 +9815,31 @@ bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return NumBits1 == 64 && NumBits2 == 32;
 }
 
+bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  // Generally speaking, zexts are not free, but they are free when they can be
+  // folded with other operations.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
+    EVT MemVT = LD->getMemoryVT();
+    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
+         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
+        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+         LD->getExtensionType() == ISD::ZEXTLOAD))
+      return true;
+  }
+
+  // FIXME: Add other cases...
+  //  - 32-bit shifts with a zext to i64
+  //  - zext after ctlz, bswap, etc.
+  //  - zext after and by a constant mask
+
+  return TargetLowering::isZExtFree(Val, VT2);
+}
+
+bool PPCTargetLowering::isFPExtFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return true;
+}
+
 bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   return isInt<16>(Imm) || isUInt<16>(Imm);
 }
@@ -9259,9 +9848,10 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
   return isInt<16>(Imm) || isUInt<16>(Imm);
 }
 
-bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                      unsigned,
-                                                      bool *Fast) const {
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
   if (DisablePPCUnaligned)
     return false;
 
@@ -9276,7 +9866,8 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (VT.getSimpleVT().isVector()) {
     if (Subtarget.hasVSX()) {
-      if (VT != MVT::v2f64 && VT != MVT::v2i64)
+      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
+          VT != MVT::v4f32 && VT != MVT::v4i32)
         return false;
     } else {
       return false;
@@ -9309,6 +9900,19 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   return false;
 }
 
+const MCPhysReg *
+PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
+  // LR is a callee-save register, but we must treat it as clobbered by any call
+  // site. Hence we include LR in the scratch registers, which are in turn added
+  // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
+  // to CTR, which is used by any indirect call.
+  static const MCPhysReg ScratchRegs[] = {
+    PPC::X11, PPC::X12, PPC::LR8, PPC::CTR8, 0
+  };
+
+  return ScratchRegs;
+}
+
 bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
   EVT VT , unsigned DefinedValues) const {
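A note on the BuildSDIVPow2 hook added earlier in this section: PPCISD::SRA_ADDZE denotes the classic srawi/addze pair. An arithmetic right shift alone rounds toward minus infinity, but srawi also sets the carry bit when the operand is negative and nonzero bits are shifted out, and addze adds that carry back, giving the round-toward-zero result signed division requires. A scalar model on i32 (illustrative; assumes >> performs an arithmetic shift on negative values, as it does on PowerPC):

#include <stdint.h>

static int32_t sdivByPow2(int32_t X, unsigned Lg2) {  // divisor = 1 << Lg2
  int32_t Shifted = X >> Lg2;                         // srawi (also sets CA)
  uint32_t Rem = (uint32_t)X & ((1u << Lg2) - 1u);    // the bits shifted out
  int32_t Carry = (X < 0 && Rem != 0) ? 1 : 0;        // srawi's carry out
  return Shifted + Carry;                             // addze
}
// e.g. sdivByPow2(-7, 1) == -3, matching C's -7 / 2, whereas -7 >> 1 == -4.
// For a negative power-of-two divisor the hook additionally negates the
// result (the ISD::SUB from zero in BuildSDIVPow2).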