diff options
Diffstat (limited to 'contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1355 |
1 files changed, 890 insertions, 465 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6458d56..1616ff1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" @@ -75,10 +76,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; -AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) - : TargetLowering(TM) { - Subtarget = &TM.getSubtarget<AArch64Subtarget>(); +AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + const AArch64Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. @@ -120,7 +123,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) } // Compute derived properties from the register classes - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); @@ -282,14 +285,39 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); - // f16 is storage-only, so we promote operations to f32 if we know this is - // valid, and ignore them otherwise. The operations not mentioned here will - // fail to select, but this is not a major problem as no source language - // should be emitting native f16 operations yet. - setOperationAction(ISD::FADD, MVT::f16, Promote); - setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FMUL, MVT::f16, Promote); - setOperationAction(ISD::FSUB, MVT::f16, Promote); + // f16 is a storage-only type, always promote it to f32. + setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::SELECT, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FMA, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FSQRT, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. @@ -371,9 +399,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); // AArch64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; - for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { - MVT Ty = RoundingTypes[I]; + for (MVT Ty : {MVT::f32, MVT::f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); @@ -469,6 +495,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); @@ -484,6 +511,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) // Enable TBZ/TBNZ MaskAndBranchFoldingIsLegal = true; + EnableExtLdPromotion = true; setMinFunctionAlignment(2); @@ -534,11 +562,21 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); + // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 + // -> v8f16 conversions. + setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote); // Similarly, there is no direct i32 -> f64 vector conversion instruction. setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); + // Or, direct i32 -> f16 vector conversion. Set it so custom, so the + // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); @@ -570,9 +608,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) } // AArch64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; - for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { - MVT Ty = RoundingVecTypes[I]; + for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); @@ -647,6 +683,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + // [SU][MIN|MAX] are available for all NEON types apart from i64. + if (!VT.isFloatingPoint() && + VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -739,13 +781,6 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i64; } -unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { - // FIXME: On AArch64, this depends on the type. - // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). - // and the offset has to be a multiple of the related size in bytes. - return 4095; -} - FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -753,9 +788,8 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - return nullptr; + switch ((AArch64ISD::NodeType)Opcode) { + case AArch64ISD::FIRST_NUMBER: break; case AArch64ISD::CALL: return "AArch64ISD::CALL"; case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; @@ -776,6 +810,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; @@ -827,6 +864,12 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; + case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; + case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; + case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; + case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; + case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; case AArch64ISD::NOT: return "AArch64ISD::NOT"; case AArch64ISD::BIT: return "AArch64ISD::BIT"; case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; @@ -834,6 +877,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; @@ -869,6 +913,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; } + return nullptr; } MachineBasicBlock * @@ -886,9 +931,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); MachineFunction::iterator It = MBB; @@ -1127,10 +1171,133 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } +static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue CCOp, + SDValue Condition, unsigned NZCV, + SDLoc DL, SelectionDAG &DAG) { + unsigned Opcode = 0; + if (LHS.getValueType().isFloatingPoint()) + Opcode = AArch64ISD::FCCMP; + else if (RHS.getOpcode() == ISD::SUB) { + SDValue SubOp0 = RHS.getOperand(0); + if (const ConstantSDNode *SubOp0C = dyn_cast<ConstantSDNode>(SubOp0)) + if (SubOp0C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } + } + if (Opcode == 0) + Opcode = AArch64ISD::CCMP; + + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); +} + +/// Returns true if @p Val is a tree of AND/OR/SETCC operations. +static bool isConjunctionDisjunctionTree(const SDValue Val, unsigned Depth) { + if (!Val.hasOneUse()) + return false; + if (Val->getOpcode() == ISD::SETCC) + return true; + // Protect against stack overflow. + if (Depth > 1000) + return false; + if (Val->getOpcode() == ISD::AND || Val->getOpcode() == ISD::OR) { + SDValue O0 = Val->getOperand(0); + SDValue O1 = Val->getOperand(1); + return isConjunctionDisjunctionTree(O0, Depth+1) && + isConjunctionDisjunctionTree(O1, Depth+1); + } + return false; +} + +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. For example (SETCC_0 & SETCC_1) with condition cond0 and +/// cond1 can be transformed into "CMP; CCMP" with CCMP executing on cond_0 +/// and setting flags to inversed(cond_1) otherwise. +/// This recursive function produces DAG nodes that produce condition flags +/// suitable to determine the truth value of @p Val (which is AND/OR/SETCC) +/// by testing the result for the condition set to @p OutCC. If @p Negate is +/// set the opposite truth value is produced. If @p CCOp and @p Condition are +/// given then conditional comparison are created so that false is reported +/// when they are false. +static SDValue emitConjunctionDisjunctionTree( + SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, + SDValue CCOp = SDValue(), AArch64CC::CondCode Condition = AArch64CC::AL) { + assert(isConjunctionDisjunctionTree(Val, 0)); + // We're at a tree leaf, produce a c?f?cmp. + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); + bool isInteger = LHS.getValueType().isInteger(); + if (Negate) + CC = getSetCCInverse(CC, isInteger); + SDLoc DL(Val); + // Determine OutCC and handle FP special case. + if (isInteger) { + OutCC = changeIntCCToAArch64CC(CC); + } else { + assert(LHS.getValueType().isFloatingPoint()); + AArch64CC::CondCode ExtraCC; + changeFPCCToAArch64CC(CC, OutCC, ExtraCC); + // Surpisingly some floating point conditions can't be tested with a + // single condition code. Construct an additional comparison in this case. + // See comment below on how we deal with OR conditions. + if (ExtraCC != AArch64CC::AL) { + SDValue ExtraCmp; + if (!CCOp.getNode()) + ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); + else { + SDValue ConditionOp = DAG.getConstant(Condition, DL, MVT_CC); + // Note that we want the inverse of ExtraCC, so NZCV is not inversed. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, + NZCV, DL, DAG); + } + CCOp = ExtraCmp; + Condition = AArch64CC::getInvertedCondCode(ExtraCC); + OutCC = AArch64CC::getInvertedCondCode(OutCC); + } + } + + // Produce a normal comparison if we are first in the chain + if (!CCOp.getNode()) + return emitComparison(LHS, RHS, CC, DL, DAG); + // Otherwise produce a ccmp. + SDValue ConditionOp = DAG.getConstant(Condition, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + DAG); + } + + // Construct comparison sequence for the left hand side. + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + + // We can only implement AND-like behaviour here, but negation is free. So we + // use (not (and (not x) (not y))) to implement (or x y). + bool isOr = Val->getOpcode() == ISD::OR; + assert((isOr || Val->getOpcode() == ISD::AND) && "Should have AND or OR."); + Negate ^= isOr; + + AArch64CC::CondCode RHSCC; + SDValue CmpR = + emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, isOr, CCOp, Condition); + SDValue CmpL = + emitConjunctionDisjunctionTree(DAG, LHS, OutCC, isOr, CmpR, RHSCC); + if (Negate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { SDValue Cmp; @@ -1151,7 +1318,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); + RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULT: @@ -1161,7 +1328,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); + RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETLE: @@ -1172,7 +1339,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); + RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULE: @@ -1183,52 +1350,61 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); + RHS = DAG.getConstant(C, dl, VT); } break; } } } - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the -1 - // constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) - // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure - // both the LHS and RHS are truely zero extended and to make sure the - // transformation is profitable. if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { - if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && - isa<LoadSDNode>(LHS)) { - if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, - DAG.getConstant(ValueofRHS, RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); - return Cmp; - } + const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truely zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && + cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + goto CreateCCNode; } } + + if ((RHSC->isNullValue() || RHSC->isOne()) && + isConjunctionDisjunctionTree(LHS, 0)) { + bool Negate = (CC == ISD::SETNE) ^ RHSC->isNullValue(); + Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC, Negate); + goto CreateCCNode; + } } + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); + +CreateCCNode: + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } @@ -1264,7 +1440,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; - bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; + bool IsSigned = Op.getOpcode() == ISD::SMULO; if (Op.getValueType() == MVT::i32) { unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // For a 32 bit multiply with overflow check we want the instruction @@ -1275,7 +1451,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, DL, MVT::i64)); // On AArch64 the upper 32 bits are always zero extended for a 32 bit // operation. We need to clear out the upper 32 bits, because we used a // widening multiply that wrote all 64 bits. In the end this should be a @@ -1288,10 +1464,10 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { // check we have to arithmetic shift right the 32nd bit of the result by // 31 bits. Then we compare the result to the upper 32 bits. SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, - DAG.getConstant(32, MVT::i64)); + DAG.getConstant(32, DL, MVT::i64)); UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, - DAG.getConstant(31, MVT::i64)); + DAG.getConstant(31, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); @@ -1304,10 +1480,11 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { // pattern: // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, - DAG.getConstant(32, MVT::i64)); + DAG.getConstant(32, DL, MVT::i64)); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = - DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + DAG.getNode(AArch64ISD::SUBS, DL, VTs, + DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; @@ -1318,7 +1495,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { if (IsSigned) { SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, - DAG.getConstant(63, MVT::i64)); + DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); @@ -1328,7 +1505,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = - DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + DAG.getNode(AArch64ISD::SUBS, DL, VTs, + DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; @@ -1347,10 +1525,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, SDLoc(Op)).first; } @@ -1405,7 +1580,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { FVal = Other; TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, - DAG.getConstant(-1ULL, Other.getValueType())); + DAG.getConstant(-1ULL, dl, Other.getValueType())); return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, CCVal, Cmp); @@ -1455,24 +1630,25 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); + SDLoc dl(Op); AArch64CC::CondCode CC; // The actual operation that sets the overflow or carry flag. SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); // We use 0 and 1 as false and true values. - SDValue TVal = DAG.getConstant(1, MVT::i32); - SDValue FVal = DAG.getConstant(0, MVT::i32); + SDValue TVal = DAG.getConstant(1, dl, MVT::i32); + SDValue FVal = DAG.getConstant(0, dl, MVT::i32); // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). - SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); - Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, + SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, CCVal, Overflow); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); + return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } // Prefetch operands are: @@ -1503,7 +1679,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), - DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); + DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, @@ -1567,6 +1743,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, if (Op.getOperand(0).getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); + // f16 conversions are promoted to f32. + if (Op.getOperand(0).getValueType() == MVT::f16) { + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); + } + if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; @@ -1578,10 +1762,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, SDLoc(Op)).first; } @@ -1600,7 +1781,7 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); - return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { @@ -1619,6 +1800,15 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); + // f16 conversions are promoted to f32. + if (Op.getValueType() == MVT::f16) { + SDLoc dl(Op); + return DAG.getNode( + ISD::FP_ROUND, dl, MVT::f16, + DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), + DAG.getIntPtrConstant(0, dl)); + } + // i128 conversions are libcalls. if (Op.getOperand(0).getValueType() == MVT::i128) return SDValue(); @@ -1679,7 +1869,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, - DAG.getTargetConstant(AArch64::hsub, MVT::i32)), + DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } @@ -1753,6 +1943,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); + SDLoc dl(N); unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); @@ -1762,9 +1953,9 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. - Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); + Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), + return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::getVectorVT(TruncVT, NumElts), Ops); } @@ -2219,8 +2410,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); + unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; @@ -2237,7 +2427,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachinePointerInfo::getStack(i * 8), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); + DAG.getConstant(8, DL, getPointerTy())); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); @@ -2248,8 +2438,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; @@ -2267,7 +2456,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachinePointerInfo::getStack(i * 16), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); + DAG.getConstant(16, DL, getPointerTy())); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); @@ -2619,8 +2808,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = - DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, + true), + DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); @@ -2690,7 +2880,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; - SDValue PtrOff = DAG.getIntPtrConstant(Offset); + SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); if (IsTailCall) { @@ -2705,7 +2895,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // clobbered. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); } else { - SDValue PtrOff = DAG.getIntPtrConstant(Offset); + SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(LocMemOffset); @@ -2713,11 +2903,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (Outs[i].Flags.isByVal()) { SDValue SizeNode = - DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); + DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVol = */ false, - /*AlwaysInline = */ false, DstInfo, MachinePointerInfo()); + /*isVol = */ false, /*AlwaysInline = */ false, + /*isTailCall = */ false, + DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { @@ -2782,8 +2973,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, DL); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), + DAG.getIntPtrConstant(0, DL, true), InFlag, DL); InFlag = Chain.getValue(1); } @@ -2795,7 +2986,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. - Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); + Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } // Add argument registers to the end of the list so that they are known live @@ -2806,19 +2997,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); + Mask = TRI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(MF, CallConv); } } else - Mask = ARI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2830,8 +3018,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. - if (IsTailCall) + if (IsTailCall) { + MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); @@ -2841,8 +3031,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, ? RoundUpToAlignment(NumBytes, 16) : 0; - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(CalleePopBytes, true), + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), + DAG.getIntPtrConstant(CalleePopBytes, DL, true), InFlag, DL); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -2958,7 +3148,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, /*isInvariant=*/ true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), PtrVT)); + DAG.getConstant(GN->getOffset(), DL, PtrVT)); return GlobalAddr; } @@ -3038,11 +3228,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and @@ -3125,11 +3312,13 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SDValue TPWithOff_lo = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, - HiVar, DAG.getTargetConstant(0, MVT::i32)), + HiVar, + DAG.getTargetConstant(0, DL, MVT::i32)), 0); SDValue TPWithOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, - LoVar, DAG.getTargetConstant(0, MVT::i32)), + LoVar, + DAG.getTargetConstant(0, DL, MVT::i32)), 0); return TPWithOff; } else if (Model == TLSModel::InitialExec) { @@ -3165,10 +3354,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, - DAG.getTargetConstant(0, MVT::i32)), + DAG.getTargetConstant(0, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), + DAG.getTargetConstant(0, DL, MVT::i32)), 0); } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make @@ -3211,7 +3400,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); + RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } @@ -3236,10 +3425,10 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); - return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } if (LHS.getValueType().isInteger()) { @@ -3261,7 +3450,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + DAG.getConstant(Log2_64(Mask), dl, MVT::i64), + Dest); } return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); @@ -3276,7 +3466,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + DAG.getConstant(Log2_64(Mask), dl, MVT::i64), + Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); @@ -3286,7 +3477,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, - DAG.getConstant(Mask, MVT::i64), Dest); + DAG.getConstant(Mask, dl, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && @@ -3296,7 +3487,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, - DAG.getConstant(Mask, MVT::i64), Dest); + DAG.getConstant(Mask, dl, MVT::i64), Dest); } SDValue CCVal; @@ -3312,11 +3503,11 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, Cmp); } @@ -3336,7 +3527,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, if (SrcVT == MVT::f32 && VT == MVT::f64) In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, + DAG.getIntPtrConstant(0, DL)); else // FIXME: Src type is different, bail out for now. Can VT really be a // vector type? @@ -3345,11 +3537,12 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, EVT VecVT; EVT EltVT; - SDValue EltMask, VecVal1, VecVal2; + uint64_t EltMask; + SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; VecVT = MVT::v4i32; - EltMask = DAG.getConstant(0x80000000ULL, EltVT); + EltMask = 0x80000000ULL; if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, @@ -3367,7 +3560,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, // We want to materialize a mask with the the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = DAG.getConstant(0, EltVT); + EltMask = 0; if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, @@ -3382,11 +3575,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, llvm_unreachable("Invalid type for copysign!"); } - std::vector<SDValue> BuildVectorOps; - for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) - BuildVectorOps.push_back(EltMask); - - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); + SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); // If we couldn't materialize the mask above, then the mask vector will be // the zero vector, and we need to negate it here. @@ -3408,8 +3597,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + if (DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) @@ -3426,21 +3615,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, - VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } + if (VT == MVT::i32) + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); @@ -3459,8 +3642,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); - SDValue TVal = DAG.getConstant(1, VT); - SDValue FVal = DAG.getConstant(0, VT); + SDValue TVal = DAG.getConstant(1, dl, VT); + SDValue FVal = DAG.getConstant(0, dl, VT); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. @@ -3497,7 +3680,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { changeFPCCToAArch64CC(CC, CC1, CC2); if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be @@ -3510,11 +3693,11 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } } @@ -3529,7 +3712,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { /// operations would *not* be semantically equivalent. static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { if (Cmp == Result) - return true; + return (Cmp.getValueType() == MVT::f32 || + Cmp.getValueType() == MVT::f64); ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); @@ -3544,49 +3728,10 @@ static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; } -SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, - SelectionDAG &DAG) const { - SDValue CC = Op->getOperand(0); - SDValue TVal = Op->getOperand(1); - SDValue FVal = Op->getOperand(2); - SDLoc DL(Op); - - unsigned Opc = CC.getOpcode(); - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select - // instruction. - if (CC.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - // Only lower legal XALUO ops. - if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) - return SDValue(); - - AArch64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, - CCVal, Overflow); - } - - if (CC.getOpcode() == ISD::SETCC) - return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, - cast<CondCodeSDNode>(CC.getOperand(2))->get()); - else - return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, - FVal, ISD::SETNE); -} - -SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, +SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, + SDValue RHS, SDValue TVal, + SDValue FVal, SDLoc dl, SelectionDAG &DAG) const { - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue TVal = Op.getOperand(2); - SDValue FVal = Op.getOperand(3); - SDLoc dl(Op); - // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { @@ -3595,7 +3740,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); + RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } @@ -3694,67 +3839,27 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - EVT VT = Op.getValueType(); + EVT VT = TVal.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); - EVT VT = Op.getValueType(); - - // Try to match this select into a max/min operation, which have dedicated - // opcode in the instruction set. - // FIXME: This is not correct in the presence of NaNs, so we only enable this - // in no-NaNs mode. - if (getTargetMachine().Options.NoNaNsFPMath) { - SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(MinMaxLHS, MinMaxRHS); - } - - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { - switch (CC) { - default: - break; - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETOGT: - case ISD::SETOGE: - return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); - break; - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETOLT: - case ISD::SETOLE: - return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); - break; - } - } - } - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. + EVT VT = TVal.getValueType(); SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } @@ -3762,6 +3867,58 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, return CS1; } +SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue TVal = Op.getOperand(2); + SDValue FVal = Op.getOperand(3); + SDLoc DL(Op); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); +} + +SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CCVal = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + unsigned Opc = CCVal.getOpcode(); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select + // instruction. + if (CCVal.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) + return SDValue(); + + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); + + return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, + CCVal, Overflow); + } + + // Lower it the same way as we would lower a SELECT_CC node. + ISD::CondCode CC; + SDValue LHS, RHS; + if (CCVal.getOpcode() == ISD::SETCC) { + LHS = CCVal.getOperand(0); + RHS = CCVal.getOperand(1); + CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); + } else { + LHS = CCVal; + RHS = DAG.getConstant(0, DL, CCVal.getValueType()); + CC = ISD::SETNE; + } + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); +} + SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking @@ -3892,11 +4049,11 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, getPointerTy())); + DAG.getConstant(8, DL, getPointerTy())); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, getPointerTy())); + DAG.getConstant(GPRSize, DL, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), false, false, 8)); @@ -3907,11 +4064,11 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, getPointerTy())); + DAG.getConstant(16, DL, getPointerTy())); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, getPointerTy())); + DAG.getConstant(FPRSize, DL, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), false, false, 8)); @@ -3919,15 +4076,17 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), + DAG.getConstant(24, DL, getPointerTy())); + MemOps.push_back(DAG.getStore(Chain, DL, + DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), false, false, 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), + DAG.getConstant(28, DL, getPointerTy())); + MemOps.push_back(DAG.getStore(Chain, DL, + DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), false, false, 4)); @@ -3944,13 +4103,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. + SDLoc DL(Op); unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), - Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), - 8, false, false, MachinePointerInfo(DestSV), + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), + Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), + 8, false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } @@ -3972,9 +4133,9 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (Align > 8) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, getPointerTy())); + DAG.getConstant(Align - 1, DL, getPointerTy())); VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, getPointerTy())); + DAG.getConstant(-(int64_t)Align, DL, getPointerTy())); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); @@ -3994,7 +4155,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, getPointerTy())); + DAG.getConstant(ArgSize, DL, getPointerTy())); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), false, false, 0); @@ -4006,7 +4167,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(), false, false, false, 0); // Round the value down to an f32. SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), - DAG.getIntPtrConstant(1)); + DAG.getIntPtrConstant(1, DL)); SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; // Merge the rounded value with the chain output of the load. return DAG.getMergeValues(Ops, DL); @@ -4041,7 +4202,8 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, .Default(0); if (Reg) return Reg; - report_fatal_error("Invalid register name global variable"); + report_fatal_error(Twine("Invalid register name \"" + + StringRef(RegName) + "\".")); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, @@ -4055,7 +4217,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, getPointerTy()); + SDValue Offset = DAG.getConstant(8, DL, getPointerTy()); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); @@ -4083,15 +4245,15 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); + DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); + DAG.getConstant(VTBits, dl, MVT::i64)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); @@ -4103,8 +4265,9 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue TrueValHi = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, MVT::i64)) - : DAG.getConstant(0, VT); + DAG.getConstant(VTBits - 1, dl, + MVT::i64)) + : DAG.getConstant(0, dl, VT); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); @@ -4127,24 +4290,24 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); + DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); + DAG.getConstant(VTBits, dl, MVT::i64)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. - SDValue TrueValLo = DAG.getConstant(0, VT); + SDValue TrueValLo = DAG.getConstant(0, dl, VT); SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); @@ -4258,7 +4421,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> AArch64TargetLowering::getRegForInlineAsmConstraint( - const std::string &Constraint, MVT VT) const { + const TargetRegisterInfo *TRI, const std::string &Constraint, + MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -4287,7 +4451,7 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass *> Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { @@ -4436,7 +4600,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( } // All assembler immediates are 64-bit integers. - Result = DAG.getTargetConstant(CVal, MVT::i64); + Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); break; } @@ -4462,7 +4626,7 @@ static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { SDLoc DL(V64Reg); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), - V64Reg, DAG.getConstant(0, MVT::i32)); + V64Reg, DAG.getConstant(0, DL, MVT::i32)); } /// getExtFactor - Determine the adjustment factor for the position when @@ -4594,25 +4758,26 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getConstant(NumSrcElts, MVT::i64)); + DAG.getConstant(NumSrcElts, dl, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, dl, MVT::i64)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, dl, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getConstant(NumSrcElts, MVT::i64)); + DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, - VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); + VEXTSrc2, + DAG.getConstant(Imm, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } @@ -4947,11 +5112,11 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { VT.getVectorNumElements() / 2); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, DL, MVT::i64)); } if (V1.getValueType().getSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, DL, MVT::i64)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } @@ -5018,7 +5183,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; - else if (EltTy == MVT::i16) + else if (EltTy == MVT::i16 || EltTy == MVT::f16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; @@ -5029,7 +5194,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, if (VT.getSizeInBits() == 64) OpLHS = WidenVector(OpLHS, DAG); - SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); + SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); } case OP_VEXT1: @@ -5037,7 +5202,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, case OP_VEXT3: { unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, - DAG.getConstant(Imm, MVT::i32)); + DAG.getConstant(Imm, dl, MVT::i32)); } case OP_VUZPL: return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, @@ -5074,7 +5239,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); + TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); } } @@ -5094,7 +5259,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } else { @@ -5102,7 +5267,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } else { @@ -5114,7 +5279,8 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, // &TBLMask[0], IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), + V1Cst, V2Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } @@ -5183,7 +5349,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } else if (VT.getSizeInBits() == 64) V1 = WidenVector(V1, DAG); - return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); + return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); } if (isREVMask(ShuffleMask, VT, 64)) @@ -5200,12 +5366,12 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, std::swap(V1, V2); Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, - DAG.getConstant(Imm, MVT::i32)); + DAG.getConstant(Imm, dl, MVT::i32)); } else if (V2->getOpcode() == ISD::UNDEF && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, - DAG.getConstant(Imm, MVT::i32)); + DAG.getConstant(Imm, dl, MVT::i32)); } unsigned WhichResult; @@ -5244,7 +5410,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, int NumInputElements = V1.getValueType().getVectorNumElements(); if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { SDValue DstVec = DstIsLeft ? V1 : V2; - SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); + SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); SDValue SrcVec = V1; int SrcLane = ShuffleMask[Anomaly]; @@ -5252,7 +5418,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SrcVec = V2; SrcLane -= VT.getVectorNumElements(); } - SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); + SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); @@ -5342,8 +5508,8 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5351,8 +5517,8 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5360,8 +5526,8 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5369,8 +5535,8 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5378,8 +5544,8 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5387,8 +5553,8 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } @@ -5492,7 +5658,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; SDValue ResultSLI = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); + DAG.getConstant(Intrin, DL, MVT::i32), X, Y, + Shift.getOperand(1)); DEBUG(dbgs() << "aarch64-lower: transformed: \n"); DEBUG(N->dump(&DAG)); @@ -5542,8 +5709,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5551,8 +5718,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5560,8 +5727,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5569,8 +5736,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5578,8 +5745,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5587,8 +5754,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } @@ -5623,7 +5790,7 @@ static SDValue NormalizeBuildVector(SDValue Op, if (Lane.getOpcode() == ISD::Constant) { APInt LowBits(EltTy.getSizeInBits(), cast<ConstantSDNode>(Lane)->getZExtValue()); - Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32); + Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); } @@ -5661,13 +5828,13 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); if (VT.getSizeInBits() == 128) { SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, - DAG.getConstant(CnstVal, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // Support the V64 version via subregister insertion. SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, - DAG.getConstant(CnstVal, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5675,8 +5842,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5684,8 +5851,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5693,8 +5860,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5702,8 +5869,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5711,8 +5878,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5720,8 +5887,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5729,8 +5896,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(264, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5738,8 +5905,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(272, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5747,7 +5914,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5756,7 +5923,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5764,7 +5931,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, VT.getSizeInBits() == 128) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, - DAG.getConstant(CnstVal, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5774,8 +5941,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5783,8 +5950,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5792,8 +5959,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5801,8 +5968,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5810,8 +5977,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5819,8 +5986,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5828,8 +5995,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(264, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } @@ -5837,8 +6004,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(272, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } @@ -5921,8 +6088,10 @@ FailedModImm: if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; - MVT NewType = - (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + EVT EltTy = VT.getVectorElementType(); + assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && + "Unsupported floating-point vector type"); + MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); @@ -5942,7 +6111,7 @@ FailedModImm: // Now insert the non-constant lanes. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) { // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. @@ -5984,7 +6153,7 @@ FailedModImm: unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, MVT::i32)); + DAG.getTargetConstant(SubIdx, dl, MVT::i32)); Vec = SDValue(N, 0); ++i; } @@ -5992,7 +6161,7 @@ FailedModImm: SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; @@ -6215,10 +6384,11 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SHL: if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) - return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), + DAG.getConstant(Cnt, DL, MVT::i32)); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32), + DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, + MVT::i32), Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: @@ -6227,8 +6397,8 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; - return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(Opc, DL, VT, Op.getOperand(0), + DAG.getConstant(Cnt, DL, MVT::i32)); } // Right shift register. Note, there is not a shift right register @@ -6240,7 +6410,8 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); + DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), + NegShift); return NegShiftLeft; } @@ -6521,6 +6692,34 @@ bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { return NumBits1 > NumBits2; } +/// Check if it is profitable to hoist instruction in then/else to if. +/// Not profitable if I and it's user can form a FMA instruction +/// because we prefer FMSUB/FMADD. +bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { + if (I->getOpcode() != Instruction::FMul) + return true; + + if (I->getNumUses() != 1) + return true; + + Instruction *User = I->user_back(); + + if (User && + !(User->getOpcode() == Instruction::FSub || + User->getOpcode() == Instruction::FAdd)) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + EVT VT = getValueType(User->getOperand(0)->getType()); + + if (isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) + return false; + + return true; +} + // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { @@ -6553,6 +6752,59 @@ bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { VT1.getSizeInBits() <= 32); } +bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { + if (isa<FPExtInst>(Ext)) + return false; + + // Vector types are next free. + if (Ext->getType()->isVectorTy()) + return false; + + for (const Use &U : Ext->uses()) { + // The extension is free if we can fold it with a left shift in an + // addressing mode or an arithmetic operation: add, sub, and cmp. + + // Is there a shift? + const Instruction *Instr = cast<Instruction>(U.getUser()); + + // Is this a constant shift? + switch (Instr->getOpcode()) { + case Instruction::Shl: + if (!isa<ConstantInt>(Instr->getOperand(1))) + return false; + break; + case Instruction::GetElementPtr: { + gep_type_iterator GTI = gep_type_begin(Instr); + std::advance(GTI, U.getOperandNo()); + Type *IdxTy = *GTI; + // This extension will end up with a shift because of the scaling factor. + // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. + // Get the shift amount based on the scaling factor: + // log2(sizeof(IdxTy)) - log2(8). + uint64_t ShiftAmt = + countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3; + // Is the constant foldable in the shift of the addressing mode? + // I.e., shift amount is between 1 and 4 inclusive. + if (ShiftAmt == 0 || ShiftAmt > 4) + return false; + break; + } + case Instruction::Trunc: + // Check if this is a noop. + // trunc(sext ty1 to ty2) to ty1. + if (Instr->getType() == Ext->getOperand(0)->getType()) + continue; + // FALL THROUGH. + default: + return false; + } + + // At this point we can use the bfm family, so this extension is free + // for that use. + } + return true; +} + bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, unsigned &RequiredAligment) const { if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) @@ -6591,13 +6843,22 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, bool Fast; const Function *F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && + !F->hasFnAttribute(Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; - return Size >= 8 ? MVT::i64 : MVT::i32; + if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || + (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast))) + return MVT::i64; + + if (Size >= 4 && + (memOpAlign(SrcAlign, DstAlign, 4) || + (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast))) + return MVT::i32; + + return MVT::Other; } // 12-bit optionally shifted immediates are legal for adds. @@ -6618,7 +6879,8 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { + Type *Ty, + unsigned AS) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset @@ -6669,7 +6931,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, } int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { + Type *Ty, + unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency // ------------------------------------------- @@ -6677,7 +6940,7 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, <extend> #imm] | - if (isLegalAddressingMode(AM, Ty)) + if (isLegalAddressingMode(AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; @@ -6748,7 +7011,7 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, unsigned LZ = countLeadingZeros((uint64_t)Val); unsigned Shift = (63 - LZ) / 16; // MOVZ is free so return true for one or fewer MOVK. - return (Shift < 3) ? true : false; + return Shift < 3; } // Generate SUBS and CSEL for integer abs. @@ -6766,14 +7029,14 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0.getOperand(0)); // Generate SUBS & CSEL. SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - N0.getOperand(0), DAG.getConstant(0, VT)); + N0.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, - DAG.getConstant(AArch64CC::PL, MVT::i32), + DAG.getConstant(AArch64CC::PL, DL, MVT::i32), SDValue(Cmp.getNode(), 1)); } return SDValue(); @@ -6802,8 +7065,8 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countTrailingZeros(); - SDValue Zero = DAG.getConstant(0, VT); - SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); // Add (N0 < 0) ? Pow2 - 1 : 0; SDValue CCVal; @@ -6819,7 +7082,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, // Divide by pow2. SDValue SRA = - DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64)); + DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. @@ -6828,7 +7091,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (Created) Created->push_back(SRA.getNode()); - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, @@ -6845,45 +7108,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { APInt Value = C->getAPIntValue(); EVT VT = N->getValueType(0); + SDLoc DL(N); if (Value.isNonNegative()) { // (mul x, 2^N + 1) => (add (shl x, N), x) APInt VM1 = Value - 1; if (VM1.isPowerOf2()) { SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), DL, MVT::i64)); + return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); } // (mul x, 2^N - 1) => (sub (shl x, N), x) APInt VP1 = Value + 1; if (VP1.isPowerOf2()) { SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), DL, MVT::i64)); + return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal, N->getOperand(0)); } } else { - // (mul x, -(2^N + 1)) => - (add (shl x, N), x) - APInt VNM1 = -Value - 1; - if (VNM1.isPowerOf2()) { - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VNM1.logBase2(), MVT::i64)); - SDValue Add = - DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add); - } // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) APInt VNP1 = -Value + 1; if (VNP1.isPowerOf2()) { SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VNP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VNP1.logBase2(), DL, MVT::i64)); + return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), ShiftedVal); } + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + APInt VNM1 = -Value - 1; + if (VNM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VNM1.logBase2(), DL, MVT::i64)); + SDValue Add = + DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add); + } } } return SDValue(); @@ -7037,7 +7301,7 @@ static SDValue tryCombineToEXTR(SDNode *N, } return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, - DAG.getConstant(ShiftRHS, MVT::i64)); + DAG.getConstant(ShiftRHS, DL, MVT::i64)); } static SDValue tryCombineToBSL(SDNode *N, @@ -7165,10 +7429,10 @@ static SDValue performBitcastCombine(SDNode *N, SDLoc dl(N); unsigned NumElements = VT.getVectorNumElements(); if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); + SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); } else { - SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); + SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, Source, SubReg), 0); @@ -7178,22 +7442,55 @@ static SDValue performBitcastCombine(SDNode *N, static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + + // Optimize concat_vectors of truncated vectors, where the intermediate + // type is illegal, to avoid said illegality, e.g., + // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), + // (v2i16 (truncate (v2i64))))) + // -> + // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), + // (v4i32 (bitcast (v2i64))), + // <0, 2, 4, 6>))) + // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed + // on both input and result type, so we might generate worse code. + // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. + if (N->getNumOperands() == 2 && + N0->getOpcode() == ISD::TRUNCATE && + N1->getOpcode() == ISD::TRUNCATE) { + SDValue N00 = N0->getOperand(0); + SDValue N10 = N1->getOperand(0); + EVT N00VT = N00.getValueType(); + + if (N00VT == N10.getValueType() && + (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && + N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { + MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); + SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); + for (size_t i = 0; i < Mask.size(); ++i) + Mask[i] = i * 2; + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getVectorShuffle( + MidVT, dl, + DAG.getNode(ISD::BITCAST, dl, MidVT, N00), + DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); + } + } + // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDLoc dl(N); - EVT VT = N->getValueType(0); - // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. - if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { + if (N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getVectorElementType().getSizeInBits() == 64); - return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, - WidenVector(N->getOperand(0), DAG), - DAG.getConstant(0, MVT::i64)); + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), + DAG.getConstant(0, dl, MVT::i64)); } // Canonicalise concat_vectors so that the right-hand vector has as few @@ -7205,10 +7502,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - SDValue Op1 = N->getOperand(1); - if (Op1->getOpcode() != ISD::BITCAST) + if (N1->getOpcode() != ISD::BITCAST) return SDValue(); - SDValue RHS = Op1->getOperand(0); + SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) @@ -7218,10 +7514,10 @@ static SDValue performConcatVectorsCombine(SDNode *N, MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); - return DAG.getNode( - ISD::BITCAST, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, - DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), + RHS)); } static SDValue tryCombineFixedPointConvert(SDNode *N, @@ -7310,15 +7606,16 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { unsigned NumElems = NarrowTy.getVectorNumElements(); MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); + SDLoc dl(N); SDValue NewDUP; if (IsDUPLANE) - NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), + NewDUP = DAG.getNode(N.getOpcode(), dl, NewDUPVT, N.getOperand(0), N.getOperand(1)); else - NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); + NewDUP = DAG.getNode(AArch64ISD::DUP, dl, NewDUPVT, N.getOperand(0)); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, - NewDUP, DAG.getConstant(NumElems, MVT::i64)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, NewDUP, + DAG.getConstant(NumElems, dl, MVT::i64)); } static bool isEssentiallyExtractSubvector(SDValue N) { @@ -7443,7 +7740,8 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { SDLoc dl(Op); if (InfoAndKind.IsAArch64) { CCVal = DAG.getConstant( - AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32); + AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, + MVT::i32); Cmp = *InfoAndKind.Info.AArch64.Cmp; } else Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, @@ -7452,7 +7750,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { CCVal, DAG, dl); EVT VT = Op->getValueType(0); - LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); + LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } @@ -7592,12 +7890,15 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { break; } - if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(-ShiftAmount, MVT::i32)); - else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(ShiftAmount, MVT::i32)); + if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { + SDLoc dl(N); + return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), + DAG.getConstant(-ShiftAmount, dl, MVT::i32)); + } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { + SDLoc dl(N); + return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), + DAG.getConstant(ShiftAmount, dl, MVT::i32)); + } return SDValue(); } @@ -7618,6 +7919,16 @@ static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } +static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, + SelectionDAG &DAG) { + SDLoc dl(N); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), + DAG.getNode(Opc, dl, + N->getOperand(1).getSimpleValueType(), + N->getOperand(1)), + DAG.getConstant(0, dl, MVT::i64)); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -7630,6 +7941,18 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); break; + case Intrinsic::aarch64_neon_saddv: + return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); + case Intrinsic::aarch64_neon_uaddv: + return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); + case Intrinsic::aarch64_neon_sminv: + return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); + case Intrinsic::aarch64_neon_uminv: + return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); + case Intrinsic::aarch64_neon_smaxv: + return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); + case Intrinsic::aarch64_neon_umaxv: + return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); @@ -7744,9 +8067,9 @@ static SDValue performExtendCombine(SDNode *N, EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); + DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -7806,7 +8129,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, MVT::i64)); + DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), Alignment); @@ -7827,14 +8150,13 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundries. We want to split such stores. + // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); // Don't split at Oz. MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); + bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (IsMinSize) return SDValue(); @@ -7868,15 +8190,15 @@ static SDValue performSTORECombine(SDNode *N, EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getConstant(0, MVT::i64)); + DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getConstant(NumElts, MVT::i64)); + DAG.getConstant(NumElts, DL, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), S->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(8, MVT::i64)); + DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), S->getAlignment()); @@ -7944,6 +8266,13 @@ static SDValue performPostLD1Combine(SDNode *N, Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } + // Finally, check that the vector doesn't depend on the load. + // Again, this would create a cycle. + // The load depending on the vector is fine, as that's the case for the + // LD1*post we'll eventually generate anyway. + if (LoadSDN->isPredecessorOf(Vector.getNode())) + continue; + SmallVector<SDValue, 8> Ops; Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { @@ -7961,7 +8290,7 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - std::vector<SDValue> NewResults; + SmallVector<SDValue, 2> NewResults; NewResults.push_back(SDValue(LD, 0)); // The result of load NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain DCI.CombineTo(LD, NewResults); @@ -8455,13 +8784,21 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { /// the compare-mask instructions rather than going via NZCV, even if LHS and /// RHS are really scalar. This replaces any scalar setcc in the above pattern /// with a vector one followed by a DUP shuffle on the result. -static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performSelectCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); - if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + if (N0.getOpcode() != ISD::SETCC) return SDValue(); + // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered + // scalar SetCCResultType. We also don't expect vectors, because we assume + // that selects fed by vector SETCCs are canonicalized to VSELECT. + assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && + "Scalar-SETCC feeding SELECT has unexpected result type!"); + // If NumMaskElts == 0, the comparison is larger than select result. The // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. @@ -8479,6 +8816,16 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + // Also bail out if the vector CCVT isn't the same size as ResVT. + // This can happen if the SETCC operand size doesn't divide the ResVT size + // (e.g., f64 vs v3f32). + if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) + return SDValue(); + + // Make sure we didn't create illegal types, if we're not supposed to. + assert(DCI.isBeforeLegalize() || + DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); + // First perform a vector comparison, where lane 0 is the one we're interested // in. SDLoc DL(N0); @@ -8497,6 +8844,75 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } +/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC +/// to match FMIN/FMAX patterns. +static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) { + // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y". + // Unless the NoNaNsFPMath option is set, be careful about NaNs: + // vmax/vmin return NaN if either operand is a NaN; + // only do the transformation when it matches that behavior. + + SDValue CondLHS = N->getOperand(0); + SDValue CondRHS = N->getOperand(1); + SDValue LHS = N->getOperand(2); + SDValue RHS = N->getOperand(3); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + unsigned Opcode; + bool IsReversed; + if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) && + selectCCOpsAreFMaxCompatible(CondRHS, RHS)) { + IsReversed = false; // x CC y ? x : y + } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) && + selectCCOpsAreFMaxCompatible(CondLHS, RHS)) { + IsReversed = true ; // x CC y ? y : x + } else { + return SDValue(); + } + + bool IsUnordered = false, IsOrEqual; + switch (CC) { + default: + return SDValue(); + case ISD::SETULT: + case ISD::SETULE: + IsUnordered = true; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETLT: + case ISD::SETLE: + IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE); + Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN; + break; + + case ISD::SETUGT: + case ISD::SETUGE: + IsUnordered = true; + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETGT: + case ISD::SETGE: + IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE); + Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX; + break; + } + + // If LHS is NaN, an ordered comparison will be false and the result will be + // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking + // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN. + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + return SDValue(); + + // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true, + // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be + // used for unsafe math or if one of the operands is known to be nonzero. + if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + return SDValue(); + + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8526,9 +8942,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: - return performSelectCombine(N, DAG); + return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); + case ISD::SELECT_CC: + return performSelectCCCombine(N, DCI.DAG); case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: @@ -8699,7 +9117,7 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, - DAG.getTargetConstant(AArch64::hsub, MVT::i32)), + DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); @@ -8760,9 +9178,11 @@ bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +TargetLoweringBase::AtomicRMWExpansionKind +AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128; + return Size <= 128 ? AtomicRMWExpansionKind::LLSC + : AtomicRMWExpansionKind::None; } bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { @@ -8822,7 +9242,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Stxr, Lo, Hi, Addr); + return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } Intrinsic::ID Int = @@ -8830,13 +9250,18 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); - return Builder.CreateCall2( - Stxr, Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr); + return Builder.CreateCall(Stxr, + {Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr}); } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, + EVT) const { + return false; +} |