diff options
Diffstat (limited to 'contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1185 |
1 files changed, 652 insertions, 533 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 849058b..9d87988 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" -#include "AArch64ISelLowering.h" #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" @@ -22,13 +22,14 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -50,10 +51,10 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" @@ -66,6 +67,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetCallingConv.h" @@ -90,6 +92,7 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); 
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, @@ -104,6 +107,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +static cl::opt<bool> +EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, + cl::desc("Enable AArch64 logical imm instruction " + "optimization"), + cl::init(true)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -372,7 +381,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); setOperationAction(ISD::FPOW, MVT::v4f16, Expand); - setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); setOperationAction(ISD::FREM, MVT::v4f16, Expand); setOperationAction(ISD::FROUND, MVT::v4f16, Expand); setOperationAction(ISD::FRINT, MVT::v4f16, Expand); @@ -404,7 +412,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Expand); setOperationAction(ISD::FPOW, MVT::v8f16, Expand); - setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); setOperationAction(ISD::FREM, MVT::v8f16, Expand); setOperationAction(ISD::FROUND, MVT::v8f16, Expand); setOperationAction(ISD::FRINT, MVT::v8f16, Expand); @@ -544,7 +551,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -554,8 +560,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 
setSchedulingPreference(Sched::Hybrid); - // Enable TBZ/TBNZ - MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; // Set required alignment. @@ -652,6 +656,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // Vector reductions + for (MVT VT : MVT::integer_valuetypes()) { + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + } + for (MVT VT : MVT::fp_valuetypes()) { + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + } + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled @@ -707,7 +724,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); @@ -751,6 +767,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); + if (!VT.isFloatingPoint()) + setOperationAction(ISD::ABS, VT, Legal); + // [SU][MIN|MAX] are available for all NEON types apart from i64. 
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) @@ -788,21 +807,157 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, + const APInt &Demanded, + TargetLowering::TargetLoweringOpt &TLO, + unsigned NewOpc) { + uint64_t OldImm = Imm, NewImm, Enc; + uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; + + // Return if the immediate is already all zeros, all ones, a bimm32 or a + // bimm64. + if (Imm == 0 || Imm == Mask || + AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) + return false; + + unsigned EltSize = Size; + uint64_t DemandedBits = Demanded.getZExtValue(); + + // Clear bits that are not demanded. + Imm &= DemandedBits; + + while (true) { + // The goal here is to set the non-demanded bits in a way that minimizes + // the number of switching between 0 and 1. In order to achieve this goal, + // we set the non-demanded bits to the value of the preceding demanded bits. + // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a + // non-demanded bit), we copy bit0 (1) to the least significant 'x', + // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. + // The final result is 0b11000011. + uint64_t NonDemandedBits = ~DemandedBits; + uint64_t InvertedImm = ~Imm & DemandedBits; + uint64_t RotatedImm = + ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & + NonDemandedBits; + uint64_t Sum = RotatedImm + NonDemandedBits; + bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); + uint64_t Ones = (Sum + Carry) & NonDemandedBits; + NewImm = (Imm | Ones) & Mask; + + // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate + // or all-ones or all-zeros, in which case we can stop searching. Otherwise, + // we halve the element size and continue the search. 
+ if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) + break; + + // We cannot shrink the element size any further if it is 2-bits. + if (EltSize == 2) + return false; + + EltSize /= 2; + Mask >>= EltSize; + uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; + + // Return if there is mismatch in any of the demanded bits of Imm and Hi. + if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) + return false; + + // Merge the upper and lower halves of Imm and DemandedBits. + Imm |= Hi; + DemandedBits |= DemandedBitsHi; + } + + ++NumOptimizedImms; + + // Replicate the element across the register width. + while (EltSize < Size) { + NewImm |= NewImm << EltSize; + EltSize *= 2; + } + + (void)OldImm; + assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && + "demanded bits should never be altered"); + assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); + + // Create the new constant immediate node. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue New; + + // If the new constant immediate is all-zeros or all-ones, let the target + // independent DAG combine optimize this node. + if (NewImm == 0 || NewImm == OrigMask) { + New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), + TLO.DAG.getConstant(NewImm, DL, VT)); + // Otherwise, create a machine node so that target independent DAG combine + // doesn't undo this optimization. + } else { + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + New = SDValue( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } + + return TLO.CombineTo(Op, New); +} + +bool AArch64TargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + // Delay this optimization to as late as possible. 
+ if (!TLO.LegalOps) + return false; + + if (!EnableOptimizeLogicalImm) + return false; + + EVT VT = Op.getValueType(); + if (VT.isVector()) + return false; + + unsigned Size = VT.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "i32 or i64 is expected after legalization."); + + // Exit early if we demand all bits. + if (Demanded.countPopulation() == Size) + return false; + + unsigned NewOpc; + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; + break; + case ISD::OR: + NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; + break; + case ISD::XOR: + NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; + break; + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!C) + return false; + uint64_t Imm = C->getZExtValue(); + return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); +} + /// computeKnownBitsForTargetNode - Determine which of the bits specified in -/// Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. +/// Mask are known to be either zero or one and return them in Known. 
void AArch64TargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, APInt &KnownZero, APInt &KnownOne, - const SelectionDAG &DAG, unsigned Depth) const { + const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; case AArch64ISD::CSEL: { - APInt KnownZero2, KnownOne2; - DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); - DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; + KnownBits Known2; + DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1); + DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; break; } case ISD::INTRINSIC_W_CHAIN: { @@ -812,10 +967,10 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( default: return; case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { - unsigned BitWidth = KnownOne.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } @@ -834,15 +989,15 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( // bits larger than the element datatype. 32-bit or larger doesn't need // this as those are legal types and will be handled by isel directly. 
MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); if (VT == MVT::v8i8 || VT == MVT::v16i8) { assert(BitWidth >= 8 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); - KnownZero |= Mask; + Known.Zero |= Mask; } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { assert(BitWidth >= 16 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); - KnownZero |= Mask; + Known.Zero |= Mask; } break; } break; @@ -2113,8 +2268,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); const char *LibcallName = @@ -2122,10 +2277,11 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2231,19 +2387,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND) - return true; - if (isExtendedBUILD_VECTOR(N, DAG, true)) - return true; - return false; + return N->getOpcode() == ISD::SIGN_EXTEND || + isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::ZERO_EXTEND) - return true; - if (isExtendedBUILD_VECTOR(N, DAG, 
false)) - return true; - return false; + return N->getOpcode() == ISD::ZERO_EXTEND || + isExtendedBUILD_VECTOR(N, DAG, false); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { @@ -2347,6 +2497,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } + case Intrinsic::aarch64_neon_abs: + return DAG.getNode(ISD::ABS, dl, Op.getValueType(), + Op.getOperand(1)); case Intrinsic::aarch64_neon_smax: return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); @@ -2465,6 +2618,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerMUL(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + return LowerVECREDUCE(Op, DAG); } } @@ -2489,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2507,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. 
SmallVector<CCValAssign, 16> ArgLocs; @@ -2663,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. + // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2708,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector<SDValue, 8> MemOps; @@ -2720,7 +2889,13 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) { + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + if (GPRSaveSize & 15) + // The extra size here, if triggered, will always be 8. + MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); + } else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2729,7 +2904,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? 
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2738,7 +2917,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -3108,9 +3287,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, - true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); @@ -3245,30 +3422,26 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
- if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + auto GV = G->getGlobal(); + if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == + AArch64II::MO_GOT) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else { const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); - else { - Callee = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast<ExternalSymbolSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); + } + } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Subtarget->isTargetMachO()) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy @@ -3428,11 +3601,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Other Lowering Code 
//===----------------------------------------------------------------------===// +SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + N->getOffset(), Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); +} + +// (loadGOT sym) +template <class NodeTy> +SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. 
+ return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); +} + +// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG) + const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, Ty, + getTargetNode(N, Ty, DAG, AArch64II::MO_G3), + getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC)); +} + +// (addlow (adrp %hi(sym)) %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE); + SDValue Lo = getTargetNode(N, Ty, DAG, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); +} + SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); + GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -3440,32 +3677,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && "unexpected offset in global node"); - // This also catched the large code model case for Darwin. + // This also catches the large code model case for Darwin. 
if ((OpFlags & AArch64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(GN, DAG); } if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(GN, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | AArch64II::MO_PAGE); - unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(GN, DAG); } } @@ -3578,7 +3798,7 @@ SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && + assert(Subtarget->useSmallAddressing() && "ELF TLS only supported in small memory model"); // Different choices can be made for the maximum size of the TLS area for a // module. 
For the small address model, the default TLS size is 16MiB and the @@ -3679,7 +3899,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) + if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); @@ -4242,90 +4462,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(JT, DAG); } - - SDValue Hi = - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. 
if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_GOT); - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(CP, DAG); } - - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(CP, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. - SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); + BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), - 
DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(BA, DAG); } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(BA, DAG); } } @@ -4342,6 +4509,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4413,8 +4595,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? 
LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4422,7 +4610,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); @@ -4516,7 +4705,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", AArch64::SP) + .Case("x18", AArch64::X18) + .Case("w18", AArch64::W18) .Default(0); + if ((Reg == AArch64::X18 || Reg == AArch64::W18) && + !Subtarget->isX18Reserved()) + Reg = 0; if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" @@ -4717,9 +4911,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, - &Flags); - Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + Flags); + Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } if (!Reciprocal) { @@ -4728,7 +4922,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue 
Operand, SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); // Correct the result if the operand is 0.0. Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, Eq, Operand, Estimate); @@ -4757,8 +4951,8 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, // AArch64 reciprocal iteration instruction: (2 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, - Estimate, &Flags); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + Estimate, Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } ExtraSteps = 0; @@ -6591,21 +6785,20 @@ FailedModImm: if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getScalarSizeInBits(); unsigned i = 0; - // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + + // Use SCALAR_TO_VECTOR for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the - // value is already in an S or D register. - // Do not do this for UNDEF/LOAD nodes because we have better patterns - // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && - (ElemSize == 32 || ElemSize == 64)) { - unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; - MachineSDNode *N = - DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, dl, MVT::i32)); - Vec = SDValue(N, 0); + // value is already in an S or D register, and we're forced to emit an + // INSERT_SUBREG that we can't fold anywhere. 
+ // + // We also allow types like i8 and i16 which are illegal scalar but legal + // vector element types. After type-legalization the inserted value is + // extended (i32) and it is safe to cast them to the vector type by ignoring + // the upper bits of the lowest lane (e.g. v8i8, v4i16). + if (!Op0.isUndef()) { + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } for (; i < NumElts; ++i) { @@ -6995,6 +7188,47 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return Cmp; } +static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, + SelectionDAG &DAG) { + SDValue VecOp = ScalarOp.getOperand(0); + auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, + DAG.getConstant(0, DL, MVT::i64)); +} + +SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + switch (Op.getOpcode()) { + case ISD::VECREDUCE_ADD: + return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); + case ISD::VECREDUCE_SMAX: + return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); + case ISD::VECREDUCE_SMIN: + return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); + case ISD::VECREDUCE_UMAX: + return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); + case ISD::VECREDUCE_UMIN: + return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); + case ISD::VECREDUCE_FMAX: { + assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), + Op.getOperand(0)); + } + case ISD::VECREDUCE_FMIN: { + assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), + Op.getOperand(0)); + } + default: + llvm_unreachable("Unhandled 
reduction"); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. @@ -7132,7 +7366,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; - if (I->getNumUses() != 1) + if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); @@ -7249,6 +7483,41 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, return NumBits == 32 || NumBits == 64; } +/// A helper function for determining the number of interleaved accesses we +/// will generate when lowering accesses of the given type. +unsigned +AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const { + return (DL.getTypeSizeInBits(VecTy) + 127) / 128; +} + +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + +bool AArch64TargetLowering::isLegalInterleavedAccessType( + VectorType *VecTy, const DataLayout &DL) const { + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + + // Ensure the number of vector elements is greater than 1. + if (VecTy->getNumElements() < 2) + return false; + + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) + return false; + + // Ensure the total vector size is 64 or a multiple of 128. Types larger than + // 128 will be split into multiple interleaved accesses. + return VecSize == 64 || VecSize % 128 == 0; +} + /// \brief Lower an interleaved load into a ldN intrinsic. /// /// E.g. 
Lower an interleaved load (Factor = 2): @@ -7272,12 +7541,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip if we do not have NEON and skip illegal vector types. - if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; + unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = VecTy->getVectorElementType(); @@ -7285,6 +7557,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad( VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + IRBuilder<> Builder(LI); + + // The base address of the load. + Value *BaseAddr = LI->getPointerOperand(); + + if (NumLoads > 1) { + // If we're going to generate more than one load, reset the sub-vector type + // to something legal. + VecTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / NumLoads); + + // We will compute the pointer operand of each load from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. 
+ BaseAddr = Builder.CreateBitCast( + BaseAddr, VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace())); + } + Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, @@ -7293,39 +7584,50 @@ bool AArch64TargetLowering::lowerInterleavedLoad( Function *LdNFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); - IRBuilder<> Builder(LI); - Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); + // Holds sub-vectors extracted from the load intrinsic return values. The + // sub-vectors are associated with the shufflevector instructions they will + // replace. + DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; - CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); + for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { - // Replace uses of each shufflevector with the corresponding vector loaded - // by ldN. - for (unsigned i = 0; i < Shuffles.size(); i++) { - ShuffleVectorInst *SVI = Shuffles[i]; - unsigned Index = Indices[i]; + // If we're generating more than one load, compute the base address of + // subsequent loads as an offset from the previous. + if (LoadCount > 0) + BaseAddr = Builder.CreateConstGEP1_32( + BaseAddr, VecTy->getVectorNumElements() * Factor); - Value *SubVec = Builder.CreateExtractValue(LdN, Index); + CallInst *LdN = Builder.CreateCall( + LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); - // Convert the integer vector to pointer vector if the element is pointer. - if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); + // Extract and store the sub-vectors returned by the load intrinsic. 
+ for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SVI = Shuffles[i]; + unsigned Index = Indices[i]; - SVI->replaceAllUsesWith(SubVec); - } + Value *SubVec = Builder.CreateExtractValue(LdN, Index); - return true; -} + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr( + SubVec, VectorType::get(SVI->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); + SubVecs[SVI].push_back(SubVec); + } + } -/// \brief Get a mask consisting of sequential integers starting from \p Start. -/// -/// I.e. <Start, Start + 1, ..., Start + NumElts - 1> -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, - unsigned NumElts) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < NumElts; i++) - Mask.push_back(Builder.getInt32(Start + i)); + // Replace uses of the shufflevector instructions with the sub-vectors + // returned by the load intrinsic. If a shufflevector instruction is + // associated with more than one sub-vector, those sub-vectors will be + // concatenated into a single wide vector. + for (ShuffleVectorInst *SVI : Shuffles) { + auto &SubVec = SubVecs[SVI]; + auto *WideVec = + SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; + SVI->replaceAllUsesWith(WideVec); + } - return ConstantVector::get(Mask); + return true; } /// \brief Lower an interleaved store into a stN intrinsic. @@ -7369,12 +7671,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip if we do not have NEON and skip illegal vector types. - if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) + // Skip if we do not have NEON and skip illegal vector types. 
We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; + unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); + Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); @@ -7394,6 +7699,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, LaneLen); } + // The base address of the store. + Value *BaseAddr = SI->getPointerOperand(); + + if (NumStores > 1) { + // If we're going to generate more than one store, reset the lane length + // and sub-vector type to something legal. + LaneLen /= NumStores; + SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + + // We will compute the pointer operand of each store from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. + BaseAddr = Builder.CreateBitCast( + BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace())); + } + + auto Mask = SVI->getShuffleMask(); + Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); Type *Tys[2] = {SubVecTy, PtrTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, @@ -7402,34 +7726,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Function *StNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); - SmallVector<Value *, 5> Ops; + for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { - // Split the shufflevector operands into sub vectors for the new stN call. 
- auto Mask = SVI->getShuffleMask(); - for (unsigned i = 0; i < Factor; i++) { - if (Mask[i] >= 0) { - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); - } else { - unsigned StartMask = 0; - for (unsigned j = 1; j < LaneLen; j++) { - if (Mask[j*Factor + i] >= 0) { - StartMask = Mask[j*Factor + i] - j; - break; + SmallVector<Value *, 5> Ops; + + // Split the shufflevector operands into sub vectors for the new stN call. + for (unsigned i = 0; i < Factor; i++) { + unsigned IdxI = StoreCount * LaneLen * Factor + i; + if (Mask[IdxI] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + unsigned IdxJ = StoreCount * LaneLen * Factor + j; + if (Mask[IdxJ * Factor + IdxI] >= 0) { + StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; + break; + } } + // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). + // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in + // isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } - // Note: If all elements in a chunk are undefs, StartMask=0! - // Note: Filling undef gaps with random elements is ok, since - // those elements were being written anyway (with undefs). 
- // In the case of all undefs we're defaulting to using elems from 0 - // Note: StartMask cannot be negative, it's checked in isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); } - } - Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); - Builder.CreateCall(StNFunc, Ops); + // If we generating more than one store, we compute the base address of + // subsequent stores as an offset from the previous. + if (StoreCount > 0) + BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); + + Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); + Builder.CreateCall(StNFunc, Ops); + } return true; } @@ -7690,7 +8023,7 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -8079,9 +8412,9 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. +/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it +/// with an EXTR. Can't quite be done in TableGen because the two immediates +/// aren't independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -8935,16 +9268,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, // instructions (stp). 
SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); + uint64_t BaseOffset = 0; + const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); + // As this in ISel, we will not merge this add which may degrade results. + if (BasePtr->getOpcode() == ISD::ADD && + isa<ConstantSDNode>(BasePtr->getOperand(1))) { + BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); + BasePtr = BasePtr->getOperand(0); + } + unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, DL, MVT::i64)); + SDValue OffsetPtr = + DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); @@ -9072,7 +9415,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); StoreSDNode *S = cast<StoreSDNode>(N); - if (S->isVolatile()) + if (S->isVolatile() || S->isIndexed()) return SDValue(); SDValue StVal = S->getValue(); @@ -9236,17 +9579,17 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } -/// Simplify \Addr given that the top byte of it is ignored by HW during +/// Simplify ``Addr`` given that the top byte of it is ignored by HW during /// address translation. 
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { APInt DemandedMask = APInt::getLowBitsSet(64, 56); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), - DCI.isBeforeLegalizeOps()); + KnownBits Known; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); return true; } @@ -9267,266 +9610,6 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } - /// This function handles the log2-shuffle pattern produced by the -/// LoopVectorizer for the across vector reduction. It consists of -/// log2(NumVectorElements) steps and, in each step, 2^(s) elements -/// are reduced, where s is an induction variable from 0 to -/// log2(NumVectorElements). -static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, - unsigned Op, - SelectionDAG &DAG) { - EVT VTy = OpV->getOperand(0).getValueType(); - if (!VTy.isVector()) - return SDValue(); - - int NumVecElts = VTy.getVectorNumElements(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (NumVecElts != 4) - return SDValue(); - } else { - if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) - return SDValue(); - } - - int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); - SDValue PreOp = OpV; - // Iterate over each step of the across vector reduction. - for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { - SDValue CurOp = PreOp.getOperand(0); - SDValue Shuffle = PreOp.getOperand(1); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { - // Try to swap the 1st and 2nd operand as add and min/max instructions - // are commutative. 
- CurOp = PreOp.getOperand(1); - Shuffle = PreOp.getOperand(0); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - } - - // Check if the input vector is fed by the operator we want to handle, - // except the last step; the very first input vector is not necessarily - // the same operator we are handling. - if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) - return SDValue(); - - // Check if it forms one step of the across vector reduction. - // E.g., - // %cur = add %1, %0 - // %shuffle = vector_shuffle %cur, <2, 3, u, u> - // %pre = add %cur, %shuffle - if (Shuffle.getOperand(0) != CurOp) - return SDValue(); - - int NumMaskElts = 1 << CurStep; - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); - // Check mask values in each step. - // We expect the shuffle mask in each step follows a specific pattern - // denoted here by the <M, U> form, where M is a sequence of integers - // starting from NumMaskElts, increasing by 1, and the number integers - // in M should be NumMaskElts. U is a sequence of UNDEFs and the number - // of undef in U should be NumVecElts - NumMaskElts. 
- // E.g., for <8 x i16>, mask values in each step should be : - // step 0 : <1,u,u,u,u,u,u,u> - // step 1 : <2,3,u,u,u,u,u,u> - // step 2 : <4,5,6,7,u,u,u,u> - for (int i = 0; i < NumVecElts; ++i) - if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || - (i >= NumMaskElts && !(Mask[i] < 0))) - return SDValue(); - - PreOp = CurOp; - } - unsigned Opcode; - bool IsIntrinsic = false; - - switch (Op) { - default: - llvm_unreachable("Unexpected operator for across vector reduction"); - case ISD::ADD: - Opcode = AArch64ISD::UADDV; - break; - case ISD::SMAX: - Opcode = AArch64ISD::SMAXV; - break; - case ISD::UMAX: - Opcode = AArch64ISD::UMAXV; - break; - case ISD::SMIN: - Opcode = AArch64ISD::SMINV; - break; - case ISD::UMIN: - Opcode = AArch64ISD::UMINV; - break; - case ISD::FMAXNUM: - Opcode = Intrinsic::aarch64_neon_fmaxnmv; - IsIntrinsic = true; - break; - case ISD::FMINNUM: - Opcode = Intrinsic::aarch64_neon_fminnmv; - IsIntrinsic = true; - break; - } - SDLoc DL(N); - - return IsIntrinsic - ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), - DAG.getConstant(Opcode, DL, MVT::i32), PreOp) - : DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), - DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), - DAG.getConstant(0, DL, MVT::i64)); -} - -/// Target-specific DAG combine for the across vector min/max reductions. -/// This function specifically handles the final clean-up step of the vector -/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which narrows down and finds the final min/max value from all -/// elements of the vector. 
-/// For example, for a <16 x i8> vector : -/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> -/// %smax0 = smax %arr, svn0 -/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax1 = smax %smax0, %svn1 -/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax2 = smax %smax1, svn2 -/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %sc = setcc %smax2, %svn3, gt -/// %n0 = extract_vector_elt %sc, #0 -/// %n1 = extract_vector_elt %smax2, #0 -/// %n2 = extract_vector_elt $smax2, #1 -/// %result = select %n0, %n1, n2 -/// becomes : -/// %1 = smaxv %0 -/// %result = extract_vector_elt %1, 0 -static SDValue -performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - - // Check if the SELECT merges up the final result of the min/max - // from a vector. - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Expect N0 is fed by SETCC. - SDValue SetCC = N0.getOperand(0); - EVT SetCCVT = SetCC.getValueType(); - if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || - SetCCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - SDValue VectorOp = SetCC.getOperand(0); - unsigned Op = VectorOp->getOpcode(); - // Check if the input vector is fed by the operator we want to handle. 
- if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && - Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) - return SDValue(); - - EVT VTy = VectorOp.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (EltTy != MVT::f32) - return SDValue(); - } else { - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - } - - // Check if extracting from the same vector. - // For example, - // %sc = setcc %vector, %svn1, gt - // %n0 = extract_vector_elt %sc, #0 - // %n1 = extract_vector_elt %vector, #0 - // %n2 = extract_vector_elt $vector, #1 - if (!(VectorOp == IfTrue->getOperand(0) && - VectorOp == IfFalse->getOperand(0))) - return SDValue(); - - // Check if the condition code is matched with the operator type. - ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); - if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || - (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || - (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || - (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || - (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && - CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && - CC != ISD::SETGE) || - (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && - CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && - CC != ISD::SETLE)) - return SDValue(); - - // Expect to check only lane 0 from the vector SETCC. - if (!isNullConstant(N0.getOperand(1))) - return SDValue(); - - // Expect to extract the true value from lane 0. - if (!isNullConstant(IfTrue.getOperand(1))) - return SDValue(); - - // Expect to extract the false value from lane 1. 
- if (!isOneConstant(IfFalse.getOperand(1))) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); -} - -/// Target-specific DAG combine for the across vector add reduction. -/// This function specifically handles the final clean-up step of the vector -/// add reduction produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which adds all elements of a vector together. -/// For example, for a <4 x i32> vector : -/// %1 = vector_shuffle %0, <2,3,u,u> -/// %2 = add %0, %1 -/// %3 = vector_shuffle %2, <1,u,u,u> -/// %4 = add %2, %3 -/// %result = extract_vector_elt %4, 0 -/// becomes : -/// %0 = uaddv %0 -/// %result = extract_vector_elt %0, 0 -static SDValue -performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // Check if the input vector is fed by the ADD. - if (N0->getOpcode() != ISD::ADD) - return SDValue(); - - // The vector extract idx must constant zero because we only expect the final - // result of the reduction is placed in lane 0. - if (!isNullConstant(N1)) - return SDValue(); - - EVT VTy = N0.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); -} /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
@@ -10205,12 +10288,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: { - SDValue RV = performSelectCombine(N, DCI); - if (!RV.getNode()) - RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); - return RV; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: @@ -10232,8 +10311,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); - case ISD::EXTRACT_VECTOR_ELT: - return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -10307,7 +10384,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. 
-bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } @@ -10453,6 +10530,14 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); + return; + case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; @@ -10483,9 +10568,9 @@ void AArch64TargetLowering::ReplaceNodeResults( } bool AArch64TargetLowering::useLoadStackGuardNode() const { - if (!Subtarget->isTargetAndroid()) - return true; - return TargetLowering::useLoadStackGuardNode(); + if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) + return TargetLowering::useLoadStackGuardNode(); + return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10527,11 +10612,17 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + if (Size > 128) return AtomicExpansionKind::None; + // Nand not supported in LSE. + if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; + // Leave 128 bits to LLSC. + return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { + // If subtarget has LSE, leave cmpxchg intact for codegen. 
+ if (Subtarget->hasLSE()) return false; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a @@ -10623,36 +10714,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } -Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) - return TargetLowering::getIRStackGuard(IRB); - - // Android provides a fixed TLS slot for the stack cookie. See the definition - // of TLS_SLOT_STACK_GUARD in - // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - const unsigned TlsOffset = 0x28; +static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } -Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) - return TargetLowering::getSafeStackPointerLocation(IRB); +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // Android provides a fixed TLS slot for the stack cookie. See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + if (Subtarget->isTargetAndroid()) + return UseTlsOffset(IRB, 0x28); + // Fuchsia is similar. + // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. 
+ if (Subtarget->isTargetFuchsia()) + return UseTlsOffset(IRB, -0x10); + + return TargetLowering::getIRStackGuard(IRB); +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - const unsigned TlsOffset = 0x48; - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); - return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); + if (Subtarget->isTargetAndroid()) + return UseTlsOffset(IRB, 0x48); + + // Fuchsia is similar. + // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + if (Subtarget->isTargetFuchsia()) + return UseTlsOffset(IRB, -0x8); + + return TargetLowering::getSafeStackPointerLocation(IRB); +} + +bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + // Only sink 'and' mask to cmp use block if it is masking a single bit, since + // this is likely to be fold the and/cmp/br into a single tbz instruction. It + // may be beneficial to sink in other cases, but we would have to check that + // the cmp would not get folded into the br to form a cbz for these to be + // beneficial. + ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); + if (!Mask) + return false; + return Mask->getUniqueInteger().isPowerOf2(); } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { @@ -10702,7 +10813,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( } } -bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on AArch64 is expensive. 
However, when aggressively // optimizing for code size, we prefer to use a div instruction, as it is // usually smaller than the alternative sequence. @@ -10711,6 +10822,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = - Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } + +unsigned +AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) + return getPointerTy(DL).getSizeInBits(); + + return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; +} |