Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 1159 |
1 file changed, 929 insertions, 230 deletions
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2b9195b..b3a3c73 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -13,37 +13,87 @@ #include "PPCISelLowering.h" #include "MCTargetDesc/PPCPredicates.h" -#include "PPCCallingConv.h" +#include "PPC.h" #include "PPCCCState.h" +#include "PPCCallingConv.h" +#include "PPCFrameLowering.h" +#include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" +#include "PPCRegisterInfo.h" +#include "PPCSubtarget.h" #include "PPCTargetMachine.h" -#include "PPCTargetObjectFile.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <list> +#include <utility> +#include <vector> using namespace llvm; @@ -86,7 +136,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, addRegisterClass(MVT::f64, &PPC::F8RCRegClass); } - // PowerPC has an i16 but no i8 (or i1) SEXTLOAD + // Match BITREVERSE to customized fast code sequence in the td file. 
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + + // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); @@ -125,7 +179,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); } - // PowerPC does not support direct load / store of condition registers + // PowerPC does not support direct load/store of condition registers. setOperationAction(ISD::LOAD, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); @@ -154,11 +208,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); setOperationAction(ISD::FREM, MVT::ppcf128, Expand); - // PowerPC has no SREM/UREM instructions - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); + // PowerPC has no SREM/UREM instructions unless we are on P9 + // On P9 we may use a hardware instruction to compute the remainder. + // The instructions are not legalized directly because in the cases where the + // result of both the remainder and the division is required it is more + // efficient to compute the remainder from the result of the division rather + // than use the remainder instruction. + if (Subtarget.isISA3_0()) { + setOperationAction(ISD::SREM, MVT::i32, Custom); + setOperationAction(ISD::UREM, MVT::i32, Custom); + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + } else { + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + } // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); @@ -360,6 +426,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // To handle counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + // Comparisons that require checking two conditions. setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); @@ -484,7 +555,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -634,6 +704,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRA, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v2i64, Legal); + // 128 bit shifts can be accomplished via 3 instructions for SHL and + // SRL, but not for SRA because of the instructions available: + // VS{RL} and VS{RL}O. 
However due to direct move costs, it's not worth + // doing + setOperationAction(ISD::SHL, MVT::v1i128, Expand); + setOperationAction(ISD::SRL, MVT::v1i128, Expand); + setOperationAction(ISD::SRA, MVT::v1i128, Expand); + setOperationAction(ISD::SETCC, MVT::v2i64, Legal); } else { @@ -687,6 +765,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasP9Vector()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + // 128 bit shifts can be accomplished via 3 instructions for SHL and + // SRL, but not for SRA because of the instructions available: + // VS{RL} and VS{RL}O. + setOperationAction(ISD::SHL, MVT::v1i128, Legal); + setOperationAction(ISD::SRL, MVT::v1i128, Legal); + setOperationAction(ISD::SRA, MVT::v1i128, Expand); } } @@ -728,7 +813,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FABS , MVT::v4f64, Legal); setOperationAction(ISD::FSIN , MVT::v4f64, Expand); setOperationAction(ISD::FCOS , MVT::v4f64, Expand); - setOperationAction(ISD::FPOWI , MVT::v4f64, Expand); setOperationAction(ISD::FPOW , MVT::v4f64, Expand); setOperationAction(ISD::FLOG , MVT::v4f64, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); @@ -774,7 +858,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FABS , MVT::v4f32, Legal); setOperationAction(ISD::FSIN , MVT::v4f32, Expand); setOperationAction(ISD::FCOS , MVT::v4f32, Expand); - setOperationAction(ISD::FPOWI , MVT::v4f32, Expand); setOperationAction(ISD::FPOW , MVT::v4f32, Expand); setOperationAction(ISD::FLOG , MVT::v4f32, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); @@ -873,6 +956,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -971,6 +1057,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxLoadsPerMemcmp = 128; + } else { + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; } } @@ -1042,6 +1132,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; + case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; @@ -1080,6 +1172,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; case PPCISD::STXSIX: return "PPCISD::STXSIX"; case PPCISD::VEXTS: return "PPCISD::VEXTS"; + case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -1523,21 +1616,47 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } -bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, - unsigned &InsertAtByte, bool &Swap, bool IsLE) { - - // Check that the mask is shuffling words - for (unsigned i = 0; i < 4; ++i) { - unsigned B0 = N->getMaskElt(i*4); - unsigned B1 = N->getMaskElt(i*4+1); - unsigned B2 = N->getMaskElt(i*4+2); - unsigned B3 = N->getMaskElt(i*4+3); - if (B0 % 4) +/// Check that the mask is shuffling N byte elements. Within each N byte +/// element of the mask, the indices could be either in increasing or +/// decreasing order as long as they are consecutive. +/// \param[in] N the shuffle vector SD Node to analyze +/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ +/// Word/DoubleWord/QuadWord). +/// \param[in] StepLen the delta indices number among the N byte element, if +/// the mask is in increasing/decreasing order then it is 1/-1. +/// \return true iff the mask is shuffling N byte elements. 
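A standalone sketch of the predicate documented above, using a plain 16-entry array in place of the ShuffleVectorSDNode accessors (hypothetical helper; undef lanes are assumed already resolved):

    #include <cassert>

    // Width is the element size in bytes; StepLen is +1 for increasing runs
    // and -1 for decreasing runs within each element.
    static bool isNByteElemMask(const int Mask[16], unsigned Width, int StepLen) {
      assert(Width == 2 || Width == 4 || Width == 8 || Width == 16);
      for (unsigned i = 0; i < 16 / Width; ++i) {
        int First = Mask[i * Width];
        // An increasing run must start on an element boundary; a decreasing
        // run must start on the last byte of an element.
        if (StepLen == 1 ? (First % (int)Width) != 0
                         : ((First + 1) % (int)Width) != 0)
          return false;
        for (unsigned j = 1; j < Width; ++j)
          if (Mask[i * Width + j] != Mask[i * Width + j - 1] + StepLen)
            return false;
      }
      return true;
    }

For example, with Width = 4 and StepLen = 1 the mask {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11} passes, and with StepLen = -1 the byte-reversed-words mask {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} passes; the latter shape is what the isXXBR* checks further down build on.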
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, + int StepLen) { + assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && + "Unexpected element width."); + assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); + + unsigned NumOfElem = 16 / Width; + unsigned MaskVal[16]; // Width is never greater than 16 + for (unsigned i = 0; i < NumOfElem; ++i) { + MaskVal[0] = N->getMaskElt(i * Width); + if ((StepLen == 1) && (MaskVal[0] % Width)) { return false; - if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1) + } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { return false; + } + + for (unsigned int j = 1; j < Width; ++j) { + MaskVal[j] = N->getMaskElt(i * Width + j); + if (MaskVal[j] != MaskVal[j-1] + StepLen) { + return false; + } + } } + return true; +} + +bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + unsigned &InsertAtByte, bool &Swap, bool IsLE) { + if (!isNByteElemShuffleMask(N, 4, 1)) + return false; + // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; @@ -1608,6 +1727,158 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } +bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + // Ensure each byte index of the word is consecutive. + if (!isNByteElemShuffleMask(N, 4, 1)) + return false; + + // Now we look at mask elements 0,4,8,12, which are the beginning of words. + unsigned M0 = N->getMaskElt(0) / 4; + unsigned M1 = N->getMaskElt(4) / 4; + unsigned M2 = N->getMaskElt(8) / 4; + unsigned M3 = N->getMaskElt(12) / 4; + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + assert(M0 < 4 && "Indexing into an undef vector?"); + if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) + return false; + + ShiftElts = IsLE ? (4 - M0) % 4 : M0; + Swap = false; + return true; + } + + // Ensure each word index of the ShuffleVector Mask is consecutive. + if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) + return false; + + if (IsLE) { + if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 3 left elements of the second vector + // (or if there is no shift to be done at all). + Swap = false; + ShiftElts = (8 - M0) % 8; + } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 3 left elements of the first vector + // (or if we're shifting by 4 - thereby simply swapping the vectors). + Swap = true; + ShiftElts = (4 - M0) % 4; + } + + return true; + } else { // BE + if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 4 elements of the first vector. + Swap = false; + ShiftElts = M0; + } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 4 elements of the right vector. 
+ Swap = true; + ShiftElts = M0 - 4; + } + + return true; + } +} + +bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + if (!isNByteElemShuffleMask(N, Width, -1)) + return false; + + for (int i = 0; i < 16; i += Width) + if (N->getMaskElt(i) != i + Width - 1) + return false; + + return true; +} + +bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 2); +} + +bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 4); +} + +bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 8); +} + +bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 16); +} + +/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap +/// if the inputs to the instruction should be swapped and set \p DM to the +/// value for the immediate. +/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI +/// AND element 0 of the result comes from the first input (LE) or second input +/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. +/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle +/// mask. +bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + // Ensure each byte index of the double word is consecutive. + if (!isNByteElemShuffleMask(N, 8, 1)) + return false; + + unsigned M0 = N->getMaskElt(0) / 8; + unsigned M1 = N->getMaskElt(8) / 8; + assert(((M0 | M1) < 4) && "A mask element out of bounds?"); + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + if ((M0 | M1) < 2) { + DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); + Swap = false; + return true; + } else + return false; + } + + if (IsLE) { + if (M0 > 1 && M1 < 2) { + Swap = false; + } else if (M0 < 2 && M1 > 1) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (((~M1) & 1) << 1) + ((~M0) & 1); + return true; + } else { // BE + if (M0 < 2 && M1 > 1) { + Swap = false; + } else if (M0 > 1 && M1 < 2) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (M0 << 1) + (M1 & 1); + return true; + } +} + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, @@ -1643,7 +1914,6 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { // If the element isn't a constant, bail fully out. if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue(); - if (!UniquedVals[i&(Multiple-1)].getNode()) UniquedVals[i&(Multiple-1)] = N->getOperand(i); else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) @@ -1763,17 +2033,17 @@ int PPC::isQVALIGNIShuffleMask(SDNode *N) { /// or 64-bit immediate, and if the value can be accurately represented as a /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. 
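The hunk that follows widens the linkage (static to llvm::) and swaps short for int16_t; the predicate itself is unchanged. A minimal standalone restatement:

    #include <cstdint>

    // A value is a valid S16 immediate iff sign-extending its low 16 bits
    // reproduces it exactly.
    static bool fitsInSExt16(int64_t V) {
      return static_cast<int16_t>(V) == V;
    }
    // fitsInSExt16(-32768) and fitsInSExt16(32767) hold; fitsInSExt16(32768)
    // and fitsInSExt16(-32769) do not.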
-static bool isIntS16Immediate(SDNode *N, short &Imm) { +bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { if (!isa<ConstantSDNode>(N)) return false; - Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); + Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue(); if (N->getValueType(0) == MVT::i32) return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); else return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); } -static bool isIntS16Immediate(SDValue Op, short &Imm) { +bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } @@ -1783,7 +2053,7 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) { bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { - short imm = 0; + int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i @@ -1800,17 +2070,14 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably // disjoint. - APInt LHSKnownZero, LHSKnownOne; - APInt RHSKnownZero, RHSKnownOne; - DAG.computeKnownBits(N.getOperand(0), - LHSKnownZero, LHSKnownOne); - - if (LHSKnownZero.getBoolValue()) { - DAG.computeKnownBits(N.getOperand(1), - RHSKnownZero, RHSKnownOne); + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N.getOperand(0), LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N.getOperand(1), RHSKnown); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. - if (~(LHSKnownZero | RHSKnownZero) == 0) { + if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { Base = N.getOperand(0); Index = N.getOperand(1); return true; @@ -1863,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. 
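The disjoint-OR test in SelectAddressRegReg above is pure bit math; a sketch of the rule, with KnownZero masks standing in for the computeKnownBits results:

    #include <cstdint>

    // If every bit position is known zero in at least one operand, an OR of
    // the two values can never generate a carry, so A | B == A + B and the
    // reg+reg memory form performs the add for free.
    static bool orEqualsAdd(uint64_t KnownZeroA, uint64_t KnownZeroB) {
      return ~(KnownZeroA | KnownZeroB) == 0;
    }

For instance, a 16-byte-aligned base (low four bits known zero) OR'd with an offset known to be below 16 (upper 60 bits known zero) is exactly base + offset.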
@@ -1876,9 +2143,9 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return false; if (N.getOpcode() == ISD::ADD) { - short imm = 0; + int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -1900,16 +2167,16 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return true; // [&g+r] } } else if (N.getOpcode() == ISD::OR) { - short imm = 0; + int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. - APInt LHSKnownZero, LHSKnownOne; - DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); + KnownBits LHSKnown; + DAG.computeKnownBits(N.getOperand(0), LHSKnown); - if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { + if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (FrameIndexSDNode *FI = @@ -1928,8 +2195,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" - short Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + int16_t Imm; + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -1939,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -1973,10 +2240,15 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, if (SelectAddressRegReg(N, Base, Index, DAG)) return true; - // If the operand is an addition, always emit this as [r+r], since this is - // better (for code size, and execution, as the memop does the add for free) - // than emitting an explicit add. - if (N.getOpcode() == ISD::ADD) { + // If the address is the result of an add, we will utilize the fact that the + // address calculation includes an implicit add. However, we can reduce + // register pressure if we do not materialize a constant just for use as the + // index register. We only get rid of the add if it is not an add of a + // value and a 16-bit signed constant and both have a single use. 
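Concretely (an illustrative case, not from the patch): for a single-use address add r5, r3, 8 feeding an X-form-only access such as lxvd2x, splitting it as base = r3 plus a materialized index 8 costs an li and a register; falling through instead keeps the addi and issues lxvd2x vs0, 0, r5, so no constant is materialized just to sit in the index register.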
+ int16_t imm = 0; + if (N.getOpcode() == ISD::ADD && + (!isIntS16Immediate(N.getOperand(1), imm) || + !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { Base = N.getOperand(0); Index = N.getOperand(1); return true; @@ -2026,7 +2298,6 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, } if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { - // Common code will reject creating a pre-inc form if the base pointer // is a frame index, or if N is a store and the base pointer is either // the same as or a predecessor of the value being stored. Check for @@ -2050,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } @@ -2277,7 +2548,6 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - // FIXME: TLS addresses currently use medium model code sequences, // which is the most useful form. Eventually support for small and // large models could be added if users need it, at the cost of @@ -2300,8 +2570,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); - SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) + : DAG.getRegister(PPC::R2, MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } @@ -2602,10 +2873,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), - std::move(Args)); + CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( + CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; @@ -2737,7 +3007,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, return false; } -bool +bool llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, @@ -2752,7 +3022,7 @@ llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, unsigned RegNum = State.getFirstUnallocated(ArgRegs); int RegsLeft = NumArgRegs - RegNum; - // Skip if there is not enough registers left for long double type (4 gpr regs + // Skip if there is not enough registers left for long double type (4 gpr regs // in soft float mode) and put long double argument on the stack. 
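A worked case for the skip rule above, assuming the 32-bit SVR4 argument GPRs r3 through r10: after five i32 arguments, only r8-r10 remain; a soft-float ppcf128 needs four GPRs, so the loop that follows marks the last three as allocated and the long double is passed entirely on the stack rather than split across registers and memory.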
if (RegNum != NumArgRegs && RegsLeft < 4) { for (int i = 0; i < RegsLeft; i++) { @@ -4066,7 +4336,7 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, static bool hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { - if (CS->arg_size() != CallerFn->getArgumentList().size()) + if (CS->arg_size() != CallerFn->arg_size()) return false; ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); @@ -4222,11 +4492,12 @@ namespace { struct TailCallArgumentInfo { SDValue Arg; SDValue FrameIdxOp; - int FrameIdx; + int FrameIdx = 0; - TailCallArgumentInfo() : FrameIdx(0) {} + TailCallArgumentInfo() = default; }; -} + +} // end anonymous namespace /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. static void StoreTailCallArgumentsToStackSlot( @@ -4406,7 +4677,6 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { - bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); @@ -4602,7 +4872,6 @@ SDValue PPCTargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); @@ -4649,7 +4918,6 @@ SDValue PPCTargetLowering::FinishCall( SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins, SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { - std::vector<EVT> NodeTys; SmallVector<SDValue, 8> Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, @@ -4909,8 +5177,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -4960,9 +5227,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; @@ -5043,9 +5309,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. 
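Across these call-lowering hunks, DAG.getCALLSEQ_START changes from taking the byte count pre-wrapped in a target constant to taking the raw sizes directly; the trailing 0 at each updated site is the new second size operand, which these call sites leave at zero.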
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; @@ -5059,7 +5325,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { - bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -5105,10 +5370,30 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( }; const unsigned NumGPRs = array_lengthof(GPR); - const unsigned NumFPRs = 13; + const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); const unsigned NumQFPRs = NumFPRs; + // On ELFv2, we can avoid allocating the parameter area if all the arguments + // can be passed to the callee in registers. + // For the fast calling convention, there is another check below. + // Note: We should keep consistent with LowerFormalArguments_64SVR4() + bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; + if (!HasParameterArea) { + unsigned ParamAreaSize = NumGPRs * PtrByteSize; + unsigned AvailableFPRs = NumFPRs; + unsigned AvailableVRs = NumVRs; + unsigned NumBytesTmp = NumBytes; + for (unsigned i = 0; i != NumOps; ++i) { + if (Outs[i].Flags.isNest()) continue; + if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) + HasParameterArea = true; + } + } + // When using the fast calling convention, we don't provide backing for // arguments that will be in registers. unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; @@ -5176,13 +5461,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned NumBytesActuallyUsed = NumBytes; - // The prolog code of the callee may store up to 8 GPR argument registers to + // In the old ELFv1 ABI, + // the prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. - // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. - NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + // In the ELFv2 ABI, we allocate the parameter area iff a callee + // really requires memory operands, e.g. a vararg function. + if (HasParameterArea) + NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + else + NumBytes = LinkageSize; // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && @@ -5204,8 +5494,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -5401,6 +5690,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (CallConv == CallingConv::Fast) ComputePtrOff(); + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); @@ -5486,6 +5777,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); @@ -5520,6 +5813,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // GPRs when within range. For now, we always put the value in both // locations (or even all three). if (isVarArg) { + assert(HasParameterArea && + "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = @@ -5552,6 +5847,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (CallConv == CallingConv::Fast) ComputePtrOff(); + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); @@ -5572,6 +5869,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v4i1: { bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; if (isVarArg) { + assert(HasParameterArea && + "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = @@ -5604,6 +5903,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (CallConv == CallingConv::Fast) ComputePtrOff(); + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); @@ -5618,7 +5919,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( } } - assert(NumBytesActuallyUsed == ArgOffset); + assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && + "mismatch in size of parameter area"); (void)NumBytesActuallyUsed; if (!MemOpChains.empty()) @@ -5673,7 +5975,6 @@ SDValue PPCTargetLowering::LowerCall_Darwin( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { - unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -5752,8 +6053,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -6065,7 +6365,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); @@ -6133,7 +6432,7 @@ PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - // Get the corect type for integers. + // Get the correct type for integers. EVT IntVT = Op.getValueType(); // Get the inputs. @@ -6150,7 +6449,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, // When we pop the dynamic allocation we need to restore the SP link. SDLoc dl(Op); - // Get the corect type for pointers. + // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. @@ -6225,7 +6524,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Size = Op.getOperand(1); SDLoc dl(Op); - // Get the corect type for pointers. + // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Negate the size. SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, @@ -6356,6 +6655,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); + LLVM_FALLTHROUGH; case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); @@ -6367,6 +6667,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -6375,6 +6676,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + LLVM_FALLTHROUGH; case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -6388,8 +6690,9 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. 
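A sketch of the contract that drives the swaps in the cases below: fsel selects on the sign of its first operand, i.e. it natively implements a setge-against-zero, so the other orderings are reached by swapping the select arms and/or the subtraction operands (NaN-sensitive codes like SETUO stay unhandled, as noted above):

    // Model of the PPC fsel instruction.
    static double fsel(double C, double T, double F) { return C >= 0.0 ? T : F; }

    // select_cc(L, R, T, F, SETGE) -> fsel(L - R, T, F)
    // select_cc(L, R, T, F, SETLT) -> fsel(L - R, F, T)   // arms swapped
    // select_cc(L, R, T, F, SETGT) -> fsel(R - L, F, T)   // operands and arms swapped
    // select_cc(L, R, T, F, SETLE) -> fsel(R - L, T, F)   // operands swapped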
case ISD::SETNE: std::swap(TV, FV); + LLVM_FALLTHROUGH; case ISD::SETEQ: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6399,25 +6702,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6585,6 +6888,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, // Given the head of the old chain, ResChain, insert a token factor containing // it and NewResChain, and make users of ResChain now be users of that token // factor. +// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead. void PPCTargetLowering::spliceIntoChain(SDValue ResChain, SDValue NewResChain, SelectionDAG &DAG) const { @@ -7585,6 +7889,53 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + + if (Subtarget.hasVSX() && + PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); + + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); + } + + if (Subtarget.hasVSX() && + PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? 
V1 : V2); + + SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); + } + + if (Subtarget.hasP9Vector()) { + if (PPC::isXXBRHShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); + } else if (PPC::isXXBRWShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); + } else if (PPC::isXXBRDShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); + } else if (PPC::isXXBRQShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); + SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); + } + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); @@ -7612,7 +7963,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); } - } if (Subtarget.hasQPX()) { @@ -7792,24 +8142,39 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = - cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); + cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; isDot = false; switch (IntrinsicID) { - default: return false; - // Comparison predicates. - case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; + default: + return false; + // Comparison predicates. 
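The block being reflowed here distinguishes the predicate (_p) intrinsics from the plain ones: the _p forms map to the record (dot) variants of the compares, which also deposit an all-true/all-false summary in CR6 (hence isDot = true), while the plain forms only produce the per-element mask. Clang exposes the former as builtins with an extra CR6 selector argument, e.g. __builtin_altivec_vcmpequw_p, and the latter as the ordinary vec_cmpeq-style operations.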
+ case Intrinsic::ppc_altivec_vcmpbfp_p: + CompareOpc = 966; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpeqfp_p: + CompareOpc = 198; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpequb_p: + CompareOpc = 6; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpequh_p: + CompareOpc = 70; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpequw_p: + CompareOpc = 134; + isDot = true; + break; case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 199; - isDot = 1; + isDot = true; } else return false; - break; case Intrinsic::ppc_altivec_vcmpneb_p: case Intrinsic::ppc_altivec_vcmpneh_p: @@ -7818,45 +8183,80 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpnezh_p: case Intrinsic::ppc_altivec_vcmpnezw_p: if (Subtarget.hasP9Altivec()) { - switch(IntrinsicID) { - default: llvm_unreachable("Unknown comparison intrinsic."); - case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break; - case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break; - case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break; - case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break; - case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break; - case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break; + switch (IntrinsicID) { + default: + llvm_unreachable("Unknown comparison intrinsic."); + case Intrinsic::ppc_altivec_vcmpneb_p: + CompareOpc = 7; + break; + case Intrinsic::ppc_altivec_vcmpneh_p: + CompareOpc = 71; + break; + case Intrinsic::ppc_altivec_vcmpnew_p: + CompareOpc = 135; + break; + case Intrinsic::ppc_altivec_vcmpnezb_p: + CompareOpc = 263; + break; + case Intrinsic::ppc_altivec_vcmpnezh_p: + CompareOpc = 327; + break; + case Intrinsic::ppc_altivec_vcmpnezw_p: + CompareOpc = 391; + break; } - isDot = 1; + isDot = true; } else return false; - break; - case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgefp_p: + CompareOpc = 454; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtfp_p: + CompareOpc = 710; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtsb_p: + CompareOpc = 774; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtsh_p: + CompareOpc = 838; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtsw_p: + CompareOpc = 902; + isDot = true; + break; case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 967; - isDot = 1; + isDot = true; } else return false; - break; - case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtub_p: + CompareOpc = 518; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtuh_p: + CompareOpc = 582; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtuw_p: + CompareOpc = 646; + isDot = true; + break; case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 711; - isDot = 1; + isDot = true; } else return false; - break; - // VSX predicate 
comparisons use the same infrastructure + + // VSX predicate comparisons use the same infrastructure case Intrinsic::ppc_vsx_xvcmpeqdp_p: case Intrinsic::ppc_vsx_xvcmpgedp_p: case Intrinsic::ppc_vsx_xvcmpgtdp_p: @@ -7865,33 +8265,51 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_vsx_xvcmpgtsp_p: if (Subtarget.hasVSX()) { switch (IntrinsicID) { - case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; - case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; - case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; - case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; - case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; - case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; + case Intrinsic::ppc_vsx_xvcmpeqdp_p: + CompareOpc = 99; + break; + case Intrinsic::ppc_vsx_xvcmpgedp_p: + CompareOpc = 115; + break; + case Intrinsic::ppc_vsx_xvcmpgtdp_p: + CompareOpc = 107; + break; + case Intrinsic::ppc_vsx_xvcmpeqsp_p: + CompareOpc = 67; + break; + case Intrinsic::ppc_vsx_xvcmpgesp_p: + CompareOpc = 83; + break; + case Intrinsic::ppc_vsx_xvcmpgtsp_p: + CompareOpc = 75; + break; } - isDot = 1; - } - else + isDot = true; + } else return false; - break; - // Normal Comparisons. - case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; + // Normal Comparisons. + case Intrinsic::ppc_altivec_vcmpbfp: + CompareOpc = 966; + break; + case Intrinsic::ppc_altivec_vcmpeqfp: + CompareOpc = 198; + break; + case Intrinsic::ppc_altivec_vcmpequb: + CompareOpc = 6; + break; + case Intrinsic::ppc_altivec_vcmpequh: + CompareOpc = 70; + break; + case Intrinsic::ppc_altivec_vcmpequw: + CompareOpc = 134; + break; case Intrinsic::ppc_altivec_vcmpequd: - if (Subtarget.hasP8Altivec()) { + if (Subtarget.hasP8Altivec()) CompareOpc = 199; - isDot = 0; - } else + else return false; - break; case Intrinsic::ppc_altivec_vcmpneb: case Intrinsic::ppc_altivec_vcmpneh: @@ -7899,43 +8317,67 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpnezb: case Intrinsic::ppc_altivec_vcmpnezh: case Intrinsic::ppc_altivec_vcmpnezw: - if (Subtarget.hasP9Altivec()) { + if (Subtarget.hasP9Altivec()) switch (IntrinsicID) { - default: llvm_unreachable("Unknown comparison intrinsic."); - case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break; - case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break; - case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break; - case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break; - case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break; - case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break; + default: + llvm_unreachable("Unknown comparison intrinsic."); + case Intrinsic::ppc_altivec_vcmpneb: + CompareOpc = 7; + break; + case Intrinsic::ppc_altivec_vcmpneh: + CompareOpc = 71; + break; + case Intrinsic::ppc_altivec_vcmpnew: + CompareOpc = 135; + break; + case Intrinsic::ppc_altivec_vcmpnezb: + CompareOpc = 263; + break; + case Intrinsic::ppc_altivec_vcmpnezh: + CompareOpc = 327; + break; + case Intrinsic::ppc_altivec_vcmpnezw: + CompareOpc = 391; + break; } - isDot = 0; - } else + else return false; break; - case 
Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgefp: + CompareOpc = 454; + break; + case Intrinsic::ppc_altivec_vcmpgtfp: + CompareOpc = 710; + break; + case Intrinsic::ppc_altivec_vcmpgtsb: + CompareOpc = 774; + break; + case Intrinsic::ppc_altivec_vcmpgtsh: + CompareOpc = 838; + break; + case Intrinsic::ppc_altivec_vcmpgtsw: + CompareOpc = 902; + break; case Intrinsic::ppc_altivec_vcmpgtsd: - if (Subtarget.hasP8Altivec()) { + if (Subtarget.hasP8Altivec()) CompareOpc = 967; - isDot = 0; - } else + else return false; - break; - case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtub: + CompareOpc = 518; + break; + case Intrinsic::ppc_altivec_vcmpgtuh: + CompareOpc = 582; + break; + case Intrinsic::ppc_altivec_vcmpgtuw: + CompareOpc = 646; + break; case Intrinsic::ppc_altivec_vcmpgtud: - if (Subtarget.hasP8Altivec()) { + if (Subtarget.hasP8Altivec()) CompareOpc = 711; - isDot = 0; - } else + else return false; - break; } return true; @@ -7950,9 +8392,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. - bool is64bit = Subtarget.isPPC64(); - return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + if (Subtarget.isPPC64()) + return DAG.getRegister(PPC::X13, MVT::i64); + return DAG.getRegister(PPC::R2, MVT::i32); } // If this is a lowered altivec predicate compare, CompareOpc is set to the @@ -8019,6 +8461,40 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return Flags; } +SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to + // the beginning of the argument list. + int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; + SDLoc DL(Op); + switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { + case Intrinsic::ppc_cfence: { + assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); + assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); + return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, + Op.getOperand(ArgStart + 1)), + Op.getOperand(0)), + 0); + } + default: + break; + } + return SDValue(); +} + +SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { + // Check for a DIV with the same operands as this REM. 
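The rationale, restated (a sketch of what each branch achieves): if a matching division exists, returning the empty SDValue lets legalization expand the remainder as

    rem = a - (a / b) * b

which reuses the already-needed quotient at the cost of a multiply and a subtract; only a remainder with no sibling division keeps the single P9 modsw/modsd-style instruction (returning Op unchanged leaves the node to be selected directly).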
+ for (auto UI : Op.getOperand(1)->uses()) { + if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) || + (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV)) + if (UI->getOperand(0) == Op.getOperand(0) && + UI->getOperand(1) == Op.getOperand(1)) + return SDValue(); + } + return Op; +} + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8044,7 +8520,7 @@ SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, } SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); @@ -8484,6 +8960,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Frame & Return address. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); + case ISD::SREM: + case ISD::UREM: + return LowerREM(Op, DAG); } } @@ -8575,9 +9057,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { // The mappings for emitLeading/TrailingFence is taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -8585,15 +9067,22 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, return nullptr; } -Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (IsLoad && isAcquireOrStronger(Ord)) +Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { + // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and + // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html + // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. + if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) + return Builder.CreateCall( + Intrinsic::getDeclaration( + Builder.GetInsertBlock()->getParent()->getParent(), + Intrinsic::ppc_cfence, {Inst->getType()}), + {Inst}); + // FIXME: Can use isync for rmw operation. return callIntrinsic(Builder, Intrinsic::ppc_lwsync); - // FIXME: this is too conservative, a dependent branch + isync is enough. - // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and - // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html - // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. 
+ } return nullptr; } @@ -8889,6 +9378,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -8902,7 +9392,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -8985,7 +9475,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); - const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -9174,10 +9663,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction *F = BB->getParent(); - if (Subtarget.hasISEL() && - (MI.getOpcode() == PPC::SELECT_CC_I4 || + if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || - MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) { + MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { SmallVector<MachineOperand, 2> Cond; if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8) @@ -9417,7 +9905,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB = EmitAtomicBinary(MI, BB, 4, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) BB = EmitAtomicBinary(MI, BB, 8, 0); - else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || (Subtarget.hasPartwordAtomics() && @@ -10028,14 +10515,12 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { return false; } - /// This function is called when we have proved that a SETCC node can be replaced /// by subtraction (and other supporting instructions) so that the result of /// comparison is kept in a GPR instead of CR. This function is purely for /// codegen purposes and has some flags to guide the codegen process. static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, bool Swap, SDLoc &DL, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); // Zero extend the operands to the largest legal integer. Originally, they @@ -10068,7 +10553,6 @@ static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); SelectionDAG &DAG = DCI.DAG; @@ -10155,17 +10639,16 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } else { // This is neither a signed nor an unsigned comparison, just make sure // that the high bits are equal. 
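      // For instance, if both operands are zero-extended i1 values, every
      // bit above bit 0 is known zero on each side, so truncating the
      // inputs cannot change the comparison result.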
-    APInt Op1Zero, Op1One;
-    APInt Op2Zero, Op2One;
-    DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
-    DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+    KnownBits Op1Known, Op2Known;
+    DAG.computeKnownBits(N->getOperand(0), Op1Known);
+    DAG.computeKnownBits(N->getOperand(1), Op2Known);

     // We don't really care about what is known about the first bit (if
     // anything), so clear it in all masks prior to comparing them.
-    Op1Zero.clearBit(0); Op1One.clearBit(0);
-    Op2Zero.clearBit(0); Op2One.clearBit(0);
+    Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
+    Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);

-    if (Op1Zero != Op2Zero || Op1One != Op2One)
+    if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
       return SDValue();
   }
 }
@@ -10842,6 +11325,132 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }

+// This function adds the required vector_shuffle needed to get
+// the elements of the vector extract in the correct position
+// as specified by the CorrectElems encoding.
+static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
+                                      SDValue Input, uint64_t Elems,
+                                      uint64_t CorrectElems) {
+  SDLoc dl(N);
+
+  unsigned NumElems = Input.getValueType().getVectorNumElements();
+  SmallVector<int, 16> ShuffleMask(NumElems, -1);
+
+  // Knowing the element indices being extracted from the original
+  // vector and the order in which they're being inserted, just put
+  // them at element indices required for the instruction.
+  for (unsigned i = 0; i < N->getNumOperands(); i++) {
+    if (DAG.getDataLayout().isLittleEndian())
+      ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
+    else
+      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
+    CorrectElems = CorrectElems >> 8;
+    Elems = Elems >> 8;
+  }
+
+  SDValue Shuffle =
+      DAG.getVectorShuffle(Input.getValueType(), dl, Input,
+                           DAG.getUNDEF(Input.getValueType()), ShuffleMask);
+
+  EVT Ty = N->getValueType(0);
+  SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
+  return BV;
+}
+
+// Look for build vector patterns where input operands come from sign
+// extended vector_extract elements of specific indices. If the correct indices
+// aren't used, add a vector shuffle to fix up the indices and create a new
+// PPCISD::SExtVElems node which selects the vector sign extend instructions
+// during instruction selection.
+static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
+  // This array encodes the indices that the vector sign extend instructions
+  // extract from when extending from one type to another for both BE and LE.
+  // The right nibble of each byte corresponds to the LE indices,
+  // and the left nibble of each byte corresponds to the BE indices.
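+  // (Reading one byte of an entry: in 0x3074B8FC the low byte 0xFC holds
+  // the BE index 0xF in its left nibble and the LE index 0xC in its right
+  // nibble.)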
+  // For example: 0x3074B8FC byte->word
+  // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
+  // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
+  // For example: 0x000070F8 byte->double word
+  // For LE: the allowed indices are: 0x0,0x8
+  // For BE: the allowed indices are: 0x7,0xF
+  uint64_t TargetElems[] = {
+      0x3074B8FC, // b->w
+      0x000070F8, // b->d
+      0x10325476, // h->w
+      0x00003074, // h->d
+      0x00001032, // w->d
+  };
+
+  uint64_t Elems = 0;
+  int Index;
+  SDValue Input;
+
+  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
+    if (!Op)
+      return false;
+    if (Op.getOpcode() != ISD::SIGN_EXTEND)
+      return false;
+
+    SDValue Extract = Op.getOperand(0);
+    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return false;
+
+    ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+    if (!ExtOp)
+      return false;
+
+    Index = ExtOp->getZExtValue();
+    if (Input && Input != Extract.getOperand(0))
+      return false;
+
+    if (!Input)
+      Input = Extract.getOperand(0);
+
+    Elems = Elems << 8;
+    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
+    Elems |= Index;
+
+    return true;
+  };
+
+  // If the build vector operands aren't sign-extended vector extracts
+  // of the same input vector, then return.
+  for (unsigned i = 0; i < N->getNumOperands(); i++) {
+    if (!isSExtOfVecExtract(N->getOperand(i))) {
+      return SDValue();
+    }
+  }
+
+  // If the vector extract indices are not correct, add the appropriate
+  // vector_shuffle.
+  int TgtElemArrayIdx;
+  int InputSize = Input.getValueType().getScalarSizeInBits();
+  int OutputSize = N->getValueType(0).getScalarSizeInBits();
+  if (InputSize + OutputSize == 40)
+    TgtElemArrayIdx = 0;
+  else if (InputSize + OutputSize == 72)
+    TgtElemArrayIdx = 1;
+  else if (InputSize + OutputSize == 48)
+    TgtElemArrayIdx = 2;
+  else if (InputSize + OutputSize == 80)
+    TgtElemArrayIdx = 3;
+  else if (InputSize + OutputSize == 96)
+    TgtElemArrayIdx = 4;
+  else
+    return SDValue();
+
+  uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
+  CorrectElems = DAG.getDataLayout().isLittleEndian()
+                     ? CorrectElems & 0x0F0F0F0F0F0F0F0F
+                     : CorrectElems & 0xF0F0F0F0F0F0F0F0;
+  if (Elems != CorrectElems) {
+    return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
+  }
+
+  // Regular lowering will catch cases where a shuffle is not needed.
+  return SDValue();
+}
+
 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   assert(N->getOpcode() == ISD::BUILD_VECTOR &&
@@ -10869,6 +11478,15 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
   if (Reduced)
     return Reduced;

+  // If we're building a vector out of extended elements from another vector,
+  // we have P9 vector integer extend instructions.
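+  // (On POWER9 these are the vector extend-sign instructions vextsb2w,
+  // vextsb2d, vextsh2w, vextsh2d and vextsw2d, selected via the
+  // PPCISD::SExtVElems node created above.)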
+  if (Subtarget.hasP9Altivec()) {
+    Reduced = combineBVOfVecSExt(N, DAG);
+    if (Reduced)
+      return Reduced;
+  }
+
+
   if (N->getValueType(0) != MVT::v2f64)
     return SDValue();
@@ -11053,6 +11671,14 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
   }
   MVT VecTy = N->getValueType(0).getSimpleVT();
+
+  // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
+  // 16-byte aligned and the type is a vector of elements of at most 4 bytes.
+  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
+      VecTy.getScalarSizeInBits() <= 32) {
+    return SDValue();
+  }
+
   SDValue LoadOps[] = { Chain, Base };
   SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
                                          DAG.getVTList(MVT::v2f64, MVT::Other),
@@ -11117,6 +11743,13 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
   SDValue Src = N->getOperand(SrcOpnd);
   MVT VecTy = Src.getValueType().getSimpleVT();
+  // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
+  // 16-byte aligned and the type is a vector of elements of at most 4 bytes.
+  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
+      VecTy.getScalarSizeInBits() <= 32) {
+    return SDValue();
+  }
+
   // All stores are done as v2f64 and possible bit cast.
   if (VecTy != MVT::v2f64) {
     Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
@@ -11141,6 +11774,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   SDLoc dl(N);
   switch (N->getOpcode()) {
   default: break;
+  case ISD::SHL:
+    return combineSHL(N, DCI);
+  case ISD::SRA:
+    return combineSRA(N, DCI);
+  case ISD::SRL:
+    return combineSRL(N, DCI);
   case PPCISD::SHL:
     if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
       return N->getOperand(0);
@@ -11227,9 +11866,20 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (BSwapOp.getValueType() == MVT::i16)
       BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+    // If the type of the BSWAP operand is wider than the stored memory width,
+    // it needs to be shifted right before the STBRX.
+    EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
+    if (Op1VT.bitsGT(mVT)) {
+      int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
+      BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
+                            DAG.getConstant(Shift, dl, MVT::i32));
+      // Need to truncate if this is a bswap of i64 stored as i32/i16.
+      if (Op1VT == MVT::i64)
+        BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
+    }
+
     SDValue Ops[] = {
-      N->getOperand(0), BSwapOp, N->getOperand(2),
-      DAG.getValueType(N->getOperand(1).getValueType())
+      N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
     };
     return DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
@@ -11570,7 +12220,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     }
     break;
-  case ISD::INTRINSIC_W_CHAIN: {
+  case ISD::INTRINSIC_W_CHAIN:
     // For little endian, VSX loads require generating lxvd2x/xxswapd.
     // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
    if (Subtarget.needsSwapsForVSXMemOps()) {
@@ -11583,8 +12233,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      }
    }
    break;
-  }
-  case ISD::INTRINSIC_VOID: {
+  case ISD::INTRINSIC_VOID:
    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
    if (Subtarget.needsSwapsForVSXMemOps()) {
@@ -11597,7 +12246,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      }
    }
    break;
-  }
  case ISD::BSWAP:
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
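    // e.g. (i32 (bswap (load p))) is selected as a single lwbrx from p,
    // folding the byte reversal into the load itself.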
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && @@ -11635,9 +12283,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Return N so it doesn't get rechecked! return SDValue(N, 0); } - break; - case PPCISD::VCMP: { + case PPCISD::VCMP: // If a VCMPo node already exists with exactly the same operands as this // node, use its result instead of this node (VCMPo computes both a CR6 and // a normal output). @@ -11687,7 +12334,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(VCMPoNode, 0); } break; - } case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); @@ -11845,17 +12491,17 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, //===----------------------------------------------------------------------===// void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); + Known.resetAll(); switch (Op.getOpcode()) { default: break; case PPCISD::LBRX: { // lhbrx is known to have the top bits cleared out. if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) - KnownZero = 0xFFFF0000; + Known.Zero = 0xFFFF0000; break; } case ISD::INTRINSIC_WO_CHAIN: { @@ -11877,7 +12523,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: case Intrinsic::ppc_altivec_vcmpgtud_p: - KnownZero = ~1U; // All bits but the low one are known to be zero. + Known.Zero = ~1U; // All bits but the low one are known to be zero. break; } } @@ -12295,7 +12941,6 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { - switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfs: @@ -12753,7 +13398,6 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const { } bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (!VT.isSimple() || !Subtarget.hasVSX()) return false; @@ -12768,3 +13412,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return Imm.isPosZero(); } } + +// For vector shift operation op, fold +// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) +static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + unsigned Opcode = N->getOpcode(); + unsigned TargetOpcode; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected shift operation"); + case ISD::SHL: + TargetOpcode = PPCISD::SHL; + break; + case ISD::SRL: + TargetOpcode = PPCISD::SRL; + break; + case ISD::SRA: + TargetOpcode = PPCISD::SRA; + break; + } + + if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && + N1->getOpcode() == ISD::AND) + if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) + if (Mask->getZExtValue() == OpSizeInBits - 1) + return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue 
PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
+  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+    return Value;
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
+  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+    return Value;
+
+  return SDValue();
+}
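The shift combines above rest on one identity: masking a shift amount with (element width - 1) is redundant on PowerPC, whose vector shift instructions already interpret each per-element shift amount modulo the element width, so stripModuloOnShift can feed the unmasked amount straight to the target node. A minimal standalone C++ check of that identity follows; it is illustrative only, and hwShl32 is a hypothetical stand-in for the per-element behaviour of an instruction like vslw, not an LLVM API.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Stand-in for a PPC vslw-style per-element shift: the hardware reads only
// the low 5 bits of each shift amount, i.e. it shifts modulo 32.
static uint32_t hwShl32(uint32_t x, uint32_t amt) { return x << (amt & 31u); }

int main() {
  for (uint32_t x : {0u, 1u, 0xDEADBEEFu}) {
    for (uint32_t y = 0; y < 256; ++y) {
      // The IR pattern (shl x, (and y, 31)) ...
      uint32_t masked = x << (y & 31u);
      // ... computes exactly what the hardware does without the mask,
      // which is why the AND node can be dropped.
      assert(masked == hwShl32(x, y));
    }
  }
  return 0;
}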