Diffstat (limited to 'contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 837
1 file changed, 583 insertions, 254 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 06bfe34..849058b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11,28 +11,79 @@ // //===----------------------------------------------------------------------===// -#include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64ISelLowering.h" #include "AArch64PerfectShuffle.h" +#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" -#include "AArch64TargetObjectFile.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include <algorithm> +#include <bitset> +#include <cassert> +#include <cctype> +#include <cstdint> +#include <cstdlib> +#include <iterator> +#include <limits> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "aarch64-lower" @@ -53,20 +104,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); -// Disabled for causing self-hosting failures once returned-attribute inference -// was enabled. -static cl::opt<bool> -EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden, - cl::desc("Directly forward this return"), - cl::init(false)); - /// Value type used for condition codes. 
static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); @@ -116,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); @@ -225,7 +270,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -520,6 +564,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); setPrefLoopAlignment(STI.getPrefLoopAlignment()); + // Only change the limit for entries in a jump table if specified by + // the subtarget, but not at the command line. + unsigned MaxJT = STI.getMaximumJumpTableSize(); + if (MaxJT && getMaximumJumpTableSize() == 0) + setMaximumJumpTableSize(MaxJT); + setHasExtractBitsInsn(true); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -764,7 +814,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( case Intrinsic::aarch64_ldxr: { unsigned BitWidth = KnownOne.getBitWidth(); EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); - unsigned MemBits = VT.getScalarType().getSizeInBits(); + unsigned MemBits = VT.getScalarSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } @@ -960,8 +1010,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; - case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; + case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; } return nullptr; } @@ -1186,7 +1238,8 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, changeFPCCToAArch64CC(CC, CondCode, CondCode2); break; case ISD::SETUO: - Invert = true; // Fallthrough + Invert = true; + LLVM_FALLTHROUGH; case ISD::SETO: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GE; @@ -2136,7 +2189,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, for (const SDValue &Elt : N->op_values()) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned EltSize = VT.getScalarSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) @@ -2163,7 +2216,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); 
SDLoc dl(N); - unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; + unsigned EltSize = VT.getScalarSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector<SDValue, 8> Ops; @@ -2435,18 +2488,25 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::Fast: case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: + case CallingConv::Swift: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; } } +CCAssignFn * +AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { + return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; +} + SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -2499,7 +2559,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // FIXME: This works on big-endian for composite byvals, which are the common // case. It should also work for fundamental types too. unsigned FrameIdx = - MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); + MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); @@ -2564,7 +2624,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; - int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); + int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); @@ -2614,7 +2674,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. 
StackOffset = ((StackOffset + 7) & ~7); - FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); } unsigned StackArgSize = CCInfo.getNextStackOffset(); @@ -2645,7 +2705,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2660,7 +2720,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2688,7 +2748,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); @@ -2735,7 +2795,7 @@ SDValue AArch64TargetLowering::LowerCallResult( // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference - if (i == 0 && isThisReturn && EnableThisRetForwarding) { + if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); @@ -2763,15 +2823,29 @@ SDValue AArch64TargetLowering::LowerCallResult( return Chain; } +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return CC == CallingConv::Fast; +} + +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + case CallingConv::PreserveMost: + case CallingConv::Swift: + return true; + default: + return canGuaranteeTCO(CC); + } +} + bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { - // For CallingConv::C this function knows whether the ABI needs - // changing. That's not true for other conventions so they will have to opt in - // manually. 
- if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + if (!mayTailCallThisCC(CalleeCC)) return false; MachineFunction &MF = DAG.getMachineFunction(); @@ -2788,9 +2862,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( if (i->hasByValAttr()) return false; - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - return IsTailCallConvention(CalleeCC) && CCMatch; - } + if (getTargetMachine().Options.GuaranteedTailCallOpt) + return canGuaranteeTCO(CalleeCC) && CCMatch; // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic @@ -2872,11 +2945,11 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo *MFI, + MachineFrameInfo &MFI, int ClobberedFI) const { SmallVector<SDValue, 8> ArgChains; - int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; + int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the @@ -2890,9 +2963,9 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); + int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); int64_t InLastByte = InFirstByte; - InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; + InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || (FirstByte <= InFirstByte && InFirstByte <= LastByte)) @@ -2908,11 +2981,6 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, return CallCC == CallingConv::Fast && TailCallOpt; } -bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast || - CallCC == CallingConv::PreserveMost; -} - /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. SDValue @@ -3087,7 +3155,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (VA.isRegLoc()) { - if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { + if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && + Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i64 && @@ -3119,7 +3188,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { Offset = Offset + FPDiff; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = @@ -3253,7 +3322,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. 
if (IsTailCall) { - MF.getFrameInfo()->setHasTailCall(); + MF.getFrameInfo().setHasTailCall(); return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); } @@ -3444,15 +3513,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); - SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 8, MachineMemOperand::MONonTemporal | - MachineMemOperand::MOInvariant); + SDValue FuncTLVGet = DAG.getLoad( + MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 8, + MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setAdjustsStack(true); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MFI.setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be @@ -3614,6 +3684,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, llvm_unreachable("Unexpected platform trying to use TLS"); } + SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); @@ -3705,7 +3776,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. - uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; + uint64_t Mask = LHS.getValueSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } @@ -3715,7 +3786,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. - uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; + uint64_t Mask = LHS.getValueSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } @@ -4036,6 +4107,33 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } } + // Avoid materializing a constant when possible by reusing a known value in + // a register. However, don't perform this optimization if the known value + // is one, zero or negative one in the case of a CSEL. We can always + // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the + // FVal, respectively. + ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); + if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && + !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) { + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to + // "a != C ? x : a" to avoid materializing C. 
+ if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) + TVal = LHS; + else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) + FVal = LHS; + } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { + assert (CTVal && CFVal && "Expected constant operands for CSNEG."); + // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to + // avoid materializing C. + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { + Opcode = AArch64ISD::CSINV; + TVal = LHS; + FVal = DAG.getConstant(0, dl, FVal.getValueType()); + } + } + SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); @@ -4053,6 +4151,26 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); + + if (DAG.getTarget().Options.UnsafeFPMath) { + // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and + // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. + ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); + if (RHSVal && RHSVal->isZero()) { + ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); + ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); + + if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && + CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) + TVal = LHS; + else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && + CFVal && CFVal->isZero() && + FVal.getValueType() == LHS.getValueType()) + FVal = LHS; + } + } + + // Emit first, and possibly only, CSEL. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); @@ -4378,8 +4496,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MFI.setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -4408,8 +4526,8 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MFI->setReturnAddressIsTaken(true); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MFI.setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -4484,7 +4602,6 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } - /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, @@ -4559,38 +4676,96 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// -/// getEstimate - Return the appropriate estimate DAG for either the reciprocal -/// or the reciprocal square root. 
-static SDValue getEstimate(const AArch64Subtarget &ST, - const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, - const SDValue &Operand, unsigned &ExtraSteps) { - if (!ST.hasNEON()) - return SDValue(); - +static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, + SDValue Operand, SelectionDAG &DAG, + int &ExtraSteps) { EVT VT = Operand.getValueType(); + if (ST->hasNEON() && + (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || + VT == MVT::f32 || VT == MVT::v1f32 || + VT == MVT::v2f32 || VT == MVT::v4f32)) { + if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) + // For the reciprocal estimates, convergence is quadratic, so the number + // of digits is doubled after each iteration. In ARMv8, the accuracy of + // the initial estimate is 2^-8. Thus the number of extra steps to refine + // the result for float (23 mantissa bits) is 2 and for double (52 + // mantissa bits) is 3. + ExtraSteps = VT == MVT::f64 ? 3 : 2; - std::string RecipOp; - RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; - RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; - RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; + return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); + } - TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; - if (!Recips.isEnabled(RecipOp)) - return SDValue(); + return SDValue(); +} + +SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &ExtraSteps, + bool &UseOneConst, + bool Reciprocal) const { + if (Enabled == ReciprocalEstimate::Enabled || + (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) + if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, + DAG, ExtraSteps)) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + + // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) + // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) + for (int i = ExtraSteps; i > 0; --i) { + SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, + &Flags); + Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + } + + if (!Reciprocal) { + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + VT); + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); + SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); + + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags); + // Correct the result if the operand is 0.0. + Estimate = DAG.getNode(VT.isVector() ? 
ISD::VSELECT : ISD::SELECT, DL, + VT, Eq, Operand, Estimate); + } + + ExtraSteps = 0; + return Estimate; + } - ExtraSteps = Recips.getRefinementSteps(RecipOp); - return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); + return SDValue(); } SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { - return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); -} + SelectionDAG &DAG, int Enabled, + int &ExtraSteps) const { + if (Enabled == ReciprocalEstimate::Enabled) + if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, + DAG, ExtraSteps)) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + + // Newton reciprocal iteration: E * (2 - X * E) + // AArch64 reciprocal iteration instruction: (2 - M * N) + for (int i = ExtraSteps; i > 0; --i) { + SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, + Estimate, &Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + } -SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { - UseOneConst = true; - return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); + ExtraSteps = 0; + return Estimate; + } + + return SDValue(); } //===----------------------------------------------------------------------===// @@ -4704,7 +4879,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); case 'w': - if (VT == MVT::f32) + if (VT.getSizeInBits() == 16) + return std::make_pair(0U, &AArch64::FPR16RegClass); + if (VT.getSizeInBits() == 32) return std::make_pair(0U, &AArch64::FPR32RegClass); if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); @@ -4949,10 +5126,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int WindowBase; int WindowScale; - bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) - : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), - WindowScale(1) {} + : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), + ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR @@ -4971,7 +5149,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + auto Source = find(Sources, SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); @@ -4996,7 +5174,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, } } unsigned ResMultiplier = - VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); @@ -5081,21 +5259,21 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // The stars all align, our next step is to produce the mask for the shuffle. 
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); - int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.isUndef()) continue; - auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + auto Src = find(Sources, Entry.getOperand(0)); int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); - int BitsDefined = std::min(OrigEltTy.getSizeInBits(), - VT.getVectorElementType().getSizeInBits()); + int BitsDefined = + std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, @@ -5157,8 +5335,7 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, unsigned &Imm) { // Look for the first non-undef element. - const int *FirstRealElt = std::find_if(M.begin(), M.end(), - [](int Elt) {return Elt >= 0;}); + const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); @@ -5200,7 +5377,7 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for REV are: 16, 32, 64"); - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; @@ -5381,7 +5558,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); - bool SplitV0 = V0.getValueType().getSizeInBits() == 128; + bool SplitV0 = V0.getValueSizeInBits() == 128; if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); @@ -5392,7 +5569,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); } - if (V1.getValueType().getSizeInBits() == 128) { + if (V1.getValueSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, DAG.getConstant(0, DL, MVT::i64)); } @@ -5523,7 +5700,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; - if (Op.getValueType().getSizeInBits() == 128) { + if (Op.getValueSizeInBits() == 128) { IndexVT = MVT::v16i8; IndexLen = 16; } @@ -5918,7 +6095,7 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { // Is C1 == ~C2, taking into account how much one can shift elements of a // particular size? uint64_t C2 = C2node->getZExtValue(); - unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); + unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); unsigned ElemMask = (1 << ElemSizeInBits) - 1; @@ -6351,7 +6528,7 @@ FailedModImm: // DUPLANE works on 128-bit vectors, widen it if necessary. 
SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); - if (Value.getValueType().getSizeInBits() == 64) + if (Value.getValueSizeInBits() == 64) Value = WidenVector(Value, DAG); unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); @@ -6414,7 +6591,7 @@ FailedModImm: if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); + unsigned ElemSize = VT.getScalarSizeInBits(); unsigned i = 0; // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to // a) Avoid a RMW dependency on the full vector register, and @@ -6528,7 +6705,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, return SDValue(); unsigned Val = Cst->getZExtValue(); - unsigned Size = Op.getValueType().getSizeInBits(); + unsigned Size = Op.getValueSizeInBits(); // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. if (Val == 0) @@ -6536,7 +6713,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. - if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) + if (Size == 64 && Val * VT.getScalarSizeInBits() == 64) return Op; return SDValue(); @@ -6606,7 +6783,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); @@ -6617,7 +6794,7 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { /// 1 <= Value <= ElementBits for a right shift; or static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); @@ -6631,7 +6808,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, if (!Op.getOperand(1).getValueType().isVector()) return Op; - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned EltSize = VT.getScalarSizeInBits(); switch (Op.getOpcode()) { default: @@ -6716,8 +6893,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, case AArch64CC::LT: if (!NoNans) return SDValue(); - // If we ignore NaNs then we can use to the MI implementation. - // Fallthrough. + // If we ignore NaNs then we can use to the MI implementation. 
+ LLVM_FALLTHROUGH; case AArch64CC::MI: if (IsZero) return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); @@ -6904,7 +7081,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } case Intrinsic::aarch64_ldaxp: - case Intrinsic::aarch64_ldxp: { + case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); @@ -6914,9 +7091,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = true; Info.writeMem = false; return true; - } case Intrinsic::aarch64_stlxp: - case Intrinsic::aarch64_stxp: { + case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); @@ -6926,7 +7102,6 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = false; Info.writeMem = true; return true; - } default: break; } @@ -7033,8 +7208,8 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); auto &DL = Ext->getModule()->getDataLayout(); - std::advance(GTI, U.getOperandNo()); - Type *IdxTy = *GTI; + std::advance(GTI, U.getOperandNo()-1); + Type *IdxTy = GTI.getIndexedType(); // This extension will end up with a shift because of the scaling factor. // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. // Get the shift amount based on the scaling factor: @@ -7052,7 +7227,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // trunc(sext ty1 to ty2) to ty1. if (Instr->getType() == Ext->getOperand(0)->getType()) continue; - // FALL THROUGH. + LLVM_FALLTHROUGH; default: return false; } @@ -7063,16 +7238,6 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { return true; } -bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const { if (!LoadedType.isSimple() || @@ -7167,7 +7332,7 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, -/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: @@ -7178,6 +7343,17 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, /// /// Note that the new shufflevectors will be removed and we'll only generate one /// st3 instruction in CodeGen. +/// +/// Example for a more general valid mask (Factor 3). 
Lower: +/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, +/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr +/// +/// Into: +/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> +/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> +/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> +/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { @@ -7188,9 +7364,9 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + unsigned LaneLen = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); @@ -7215,7 +7391,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, NumSubElts); + SubVecTy = VectorType::get(IntTy, LaneLen); } Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); @@ -7229,9 +7405,28 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 5> Ops; // Split the shufflevector operands into sub vectors for the new stN call. - for (unsigned i = 0; i < Factor; i++) - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + auto Mask = SVI->getShuffleMask(); + for (unsigned i = 0; i < Factor; i++) { + if (Mask[i] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + if (Mask[j*Factor + i] >= 0) { + StartMask = Mask[j*Factor + i] - j; + break; + } + } + // Note: If all elements in a chunk are undefs, StartMask=0! + // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). 
+ // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); + } + } Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); Builder.CreateCall(StNFunc, Ops); @@ -7323,7 +7518,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, int64_t Offset = AM.BaseOffs; // 9-bit signed offset - if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) + if (isInt<9>(Offset)) return true; // 12-bit unsigned offset @@ -7337,8 +7532,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - return !AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); + return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, @@ -7544,57 +7738,98 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + // The below optimizations require a constant RHS. + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + + ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1)); + const APInt &ConstValue = C->getAPIntValue(); + // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - const APInt &Value = C->getAPIntValue(); - EVT VT = N->getValueType(0); - SDLoc DL(N); - if (Value.isNonNegative()) { - // (mul x, 2^N + 1) => (add (shl x, N), x) - APInt VM1 = Value - 1; - if (VM1.isPowerOf2()) { - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), DL, MVT::i64)); - return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, - N->getOperand(0)); - } - // (mul x, 2^N - 1) => (sub (shl x, N), x) - APInt VP1 = Value + 1; - if (VP1.isPowerOf2()) { - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), DL, MVT::i64)); - return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal, - N->getOperand(0)); - } - } else { - // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) - APInt VNP1 = -Value + 1; - if (VNP1.isPowerOf2()) { - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(VNP1.logBase2(), DL, MVT::i64)); - return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), - ShiftedVal); - } - // (mul x, -(2^N + 1)) => - (add (shl x, N), x) - APInt VNM1 = -Value - 1; - if (VNM1.isPowerOf2()) { - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(VNM1.logBase2(), DL, MVT::i64)); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add); - } - } + // More aggressively, some multiplications N0 * C can be lowered to + // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, + // e.g. 6=3*2=(2+1)*2. + // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 + // which equals to (1+2)*16-(1+2). 
+ SDValue N0 = N->getOperand(0); + // TrailingZeroes is used to test if the mul can be lowered to + // shift+add+shift. + unsigned TrailingZeroes = ConstValue.countTrailingZeros(); + if (TrailingZeroes) { + // Conservatively do not lower to shift+add+shift if the mul might be + // folded into smul or umul. + if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) || + isZeroExtended(N0.getNode(), DAG))) + return SDValue(); + // Conservatively do not lower to shift+add+shift if the mul might be + // folded into madd or msub. + if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || + N->use_begin()->getOpcode() == ISD::SUB)) + return SDValue(); } - return SDValue(); + // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub + // and shift+add+shift. + APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); + + unsigned ShiftAmt, AddSubOpc; + // Is the shifted value the LHS operand of the add/sub? + bool ShiftValUseIsN0 = true; + // Do we need to negate the result? + bool NegateResult = false; + + if (ConstValue.isNonNegative()) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) + APInt SCVMinus1 = ShiftedConstValue - 1; + APInt CVPlus1 = ConstValue + 1; + if (SCVMinus1.isPowerOf2()) { + ShiftAmt = SCVMinus1.logBase2(); + AddSubOpc = ISD::ADD; + } else if (CVPlus1.isPowerOf2()) { + ShiftAmt = CVPlus1.logBase2(); + AddSubOpc = ISD::SUB; + } else + return SDValue(); + } else { + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + APInt CVNegPlus1 = -ConstValue + 1; + APInt CVNegMinus1 = -ConstValue - 1; + if (CVNegPlus1.isPowerOf2()) { + ShiftAmt = CVNegPlus1.logBase2(); + AddSubOpc = ISD::SUB; + ShiftValUseIsN0 = false; + } else if (CVNegMinus1.isPowerOf2()) { + ShiftAmt = CVNegMinus1.logBase2(); + AddSubOpc = ISD::ADD; + NegateResult = true; + } else + return SDValue(); + } + + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0, + DAG.getConstant(ShiftAmt, DL, MVT::i64)); + + SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0; + SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal; + SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1); + assert(!(NegateResult && TrailingZeroes) && + "NegateResult and TrailingZeroes cannot both be true for now."); + // Negate the result. + if (NegateResult) + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res); + // Shift the result. + if (TrailingZeroes) + return DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getConstant(TrailingZeroes, DL, MVT::i64)); + return Res; } static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, @@ -7655,7 +7890,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Only optimize when the source and destination types have the same width. - if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) + if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // If the result of an integer load is only used by an integer-to-float @@ -7757,13 +7992,15 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point divide by power of two into fixed-point to /// floating-point conversion. 
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned Opc = Op->getOpcode(); - if (!Op.getValueType().isVector() || + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + !Op.getOperand(0).getValueType().isSimple() || (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) return SDValue(); @@ -7800,10 +8037,13 @@ static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + SDLoc DL(N); SDValue ConvInput = Op.getOperand(0); bool IsSigned = Opc == ISD::SINT_TO_FP; @@ -7855,13 +8095,13 @@ static SDValue tryCombineToEXTR(SDNode *N, SDValue LHS; uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; + bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; + bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); @@ -7901,7 +8141,7 @@ static SDValue tryCombineToBSL(SDNode *N, // We only have to look for constant vectors here since the general, variable // case can be handled in TableGen. - unsigned Bits = VT.getVectorElementType().getSizeInBits(); + unsigned Bits = VT.getScalarSizeInBits(); uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); for (int i = 1; i >= 0; --i) for (int j = 1; j >= 0; --j) { @@ -8090,7 +8330,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. if (N0 == N1 && VT.getVectorNumElements() == 2) { - assert(VT.getVectorElementType().getSizeInBits() == 64); + assert(VT.getScalarSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); } @@ -8153,7 +8393,7 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // The vector width should be 128 bits by the time we get here, even // if it started as 64 bits (the extract_vector handling will have // done so). - assert(Vec.getValueType().getSizeInBits() == 128 && + assert(Vec.getValueSizeInBits() == 128 && "unexpected vector size on extract_vector_elt!"); if (Vec.getValueType() == MVT::v4i32) VecResTy = MVT::v4f32; @@ -8655,7 +8895,7 @@ static SDValue performExtendCombine(SDNode *N, if (SrcVT.getSizeInBits() != 64) return SDValue(); - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); unsigned ElementCount = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); SDLoc DL(N); @@ -8684,13 +8924,101 @@ static SDValue performExtendCombine(SDNode *N, return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } +static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, + SDValue SplatVal, unsigned NumVecElts) { + unsigned OrigAlignment = St.getAlignment(); + unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; + + // Create scalar stores. This is at least as good as the code sequence for a + // split unaligned store which is a dup.s, ext.b, and two stores. + // Most of the time the three stores should be replaced by store pair + // instructions (stp). 
+ SDLoc DL(&St); + SDValue BasePtr = St.getBasePtr(); + const MachinePointerInfo &PtrInfo = St.getPointerInfo(); + SDValue NewST1 = + DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, + OrigAlignment, St.getMemOperand()->getFlags()); + + unsigned Offset = EltOffset; + while (--NumVecElts) { + unsigned Alignment = MinAlign(OrigAlignment, Offset); + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(Offset, DL, MVT::i64)); + NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, + PtrInfo.getWithOffset(Offset), Alignment, + St.getMemOperand()->getFlags()); + Offset += EltOffset; + } + return NewST1; +} + +/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The +/// load store optimizer pass will merge them to store pair stores. This should +/// be better than a movi to create the vector zero followed by a vector store +/// if the zero constant is not re-used, since one instructions and one register +/// live range will be removed. +/// +/// For example, the final generated code should be: +/// +/// stp xzr, xzr, [x0] +/// +/// instead of: +/// +/// movi v0.2d, #0 +/// str q0, [x0] +/// +static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { + SDValue StVal = St.getValue(); + EVT VT = StVal.getValueType(); + + // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or + // 2, 3 or 4 i32 elements. + int NumVecElts = VT.getVectorNumElements(); + if (!(((NumVecElts == 2 || NumVecElts == 3) && + VT.getVectorElementType().getSizeInBits() == 64) || + ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && + VT.getVectorElementType().getSizeInBits() == 32))) + return SDValue(); + + if (StVal.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // If the zero constant has more than one use then the vector store could be + // better since the constant mov will be amortized and stp q instructions + // should be able to be formed. + if (!StVal.hasOneUse()) + return SDValue(); + + // If the immediate offset of the address operand is too large for the stp + // instruction, then bail out. + if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { + int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); + if (Offset < -512 || Offset > 504) + return SDValue(); + } + + for (int I = 0; I < NumVecElts; ++I) { + SDValue EltVal = StVal.getOperand(I); + if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) + return SDValue(); + } + + // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from + // undoing this transformation. + SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32 + ? DAG.getRegister(AArch64::WZR, MVT::i32) + : DAG.getRegister(AArch64::XZR, MVT::i64); + return splitStoreSplat(DAG, St, SplatVal, NumVecElts); +} + /// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// value. The load store optimizer pass will merge them to store pair stores. /// This has better performance than a splat of the scalar followed by a split /// vector store. Even if the stores are not merged it is four stores vs a dup, /// followed by an ext.b and two stores. 
-static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { - SDValue StVal = St->getValue(); +static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { + SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // Don't replace floating point stores, they possibly won't be transformed to @@ -8698,55 +9026,48 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { if (VT.isFloatingPoint()) return SDValue(); - // Check for insert vector elements. - if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - // We can express a splat as store pair(s) for 2 or 4 elements. unsigned NumVecElts = VT.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 2) return SDValue(); - SDValue SplatVal = StVal.getOperand(1); - unsigned RemainInsertElts = NumVecElts - 1; // Check that this is a splat. - while (--RemainInsertElts) { - SDValue NextInsertElt = StVal.getOperand(0); - if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + // Make sure that each of the relevant vector element locations are inserted + // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. + std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); + SDValue SplatVal; + for (unsigned I = 0; I < NumVecElts; ++I) { + // Check for insert vector elements. + if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); - if (NextInsertElt.getOperand(1) != SplatVal) + + // Check that same value is inserted at each vector element. + if (I == 0) + SplatVal = StVal.getOperand(1); + else if (StVal.getOperand(1) != SplatVal) return SDValue(); - StVal = NextInsertElt; - } - unsigned OrigAlignment = St->getAlignment(); - unsigned EltOffset = NumVecElts == 4 ? 4 : 8; - unsigned Alignment = std::min(OrigAlignment, EltOffset); - // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store which is a dup.s, ext.b, and two stores. - // Most of the time the three stores should be replaced by store pair - // instructions (stp). - SDLoc DL(St); - SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = - DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); + // Check insert element index. + ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2)); + if (!CIndex) + return SDValue(); + uint64_t IndexVal = CIndex->getZExtValue(); + if (IndexVal >= NumVecElts) + return SDValue(); + IndexNotInserted.reset(IndexVal); - unsigned Offset = EltOffset; - while (--NumVecElts) { - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, DL, MVT::i64)); - NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), Alignment, - St->getMemOperand()->getFlags()); - Offset += EltOffset; + StVal = StVal.getOperand(0); } - return NewST1; + // Check that all vector element locations were inserted to. 
+ if (IndexNotInserted.any()) + return SDValue(); + + return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } -static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { +static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -8754,6 +9075,17 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, if (S->isVolatile()) return SDValue(); + SDValue StVal = S->getValue(); + EVT VT = StVal.getValueType(); + if (!VT.isVector()) + return SDValue(); + + // If we get a splat of zeros, convert this vector store to a store of + // scalars. They will be merged into store pairs of xzr thereby removing one + // instruction and one register. + if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) + return ReplacedZeroSplat; + // FIXME: The logic for deciding if an unaligned store should be split should // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. @@ -8765,12 +9097,9 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); - SDValue StVal = S->getValue(); - EVT VT = StVal.getValueType(); - // Don't split v2i64 vectors. Memcpy lowering produces those and splitting // those up regresses performance on micro-benchmarks and olden/bh. - if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) + if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) return SDValue(); // Split unaligned 16B stores. They are terrible for performance. @@ -8785,7 +9114,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. 
- if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) return ReplacedSplat; SDLoc DL(S); @@ -8928,7 +9257,7 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) + if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && @@ -9455,52 +9784,51 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width, switch(CC) { case AArch64CC::LE: - case AArch64CC::GT: { + case AArch64CC::GT: if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; - } break; + break; case AArch64CC::LT: - case AArch64CC::GE: { + case AArch64CC::GE: if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::HI: - case AArch64CC::LS: { + case AArch64CC::LS: if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::PL: - case AArch64CC::MI: { + case AArch64CC::MI: if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::LO: - case AArch64CC::HS: { + case AArch64CC::HS: if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::EQ: - case AArch64CC::NE: { + case AArch64CC::NE: if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) - return true; - } break; + break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: @@ -9862,7 +10190,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_TO_UINT: return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: - return performFDivCombine(N, DAG, Subtarget); + return performFDivCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::SRL: @@ -9995,8 +10323,10 @@ bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, // All of the indexed addressing mode instructions take a signed // 9 bit immediate offset. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - if (RHSC >= 256 || RHSC <= -256) + int64_t RHSC = RHS->getSExtValue(); + if (Op->getOpcode() == ISD::SUB) + RHSC = -(uint64_t)RHSC; + if (!isInt<9>(RHSC)) return false; IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); @@ -10222,7 +10552,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -10238,7 +10568,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), @@ -10248,8 +10578,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, |
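
The new getSqrtEstimate/getRecipEstimate hooks above document the Newton iterations they expand into FRSQRTE/FRSQRTS and FRECPE/FRECPS nodes: E * 0.5 * (3 - X * E^2) for the reciprocal square root and E * (2 - X * E) for the reciprocal, with two extra refinement steps for f32 and three for f64 given a roughly 2^-8-accurate seed. A minimal stand-alone C++ sketch of those refinement loops, using made-up scalar seeds in place of the hardware estimate instructions:

#include <cmath>
#include <cstdio>

// Scalar model of the refinement loops the patch builds in the DAG.
// Each Newton step roughly doubles the number of correct bits, which is why
// 2 extra steps suffice for f32 (23 mantissa bits) and 3 for f64 (52 bits).
static double refineRSqrt(double x, double est, int steps) {
  // Newton step for 1/sqrt(x):  E <- E * 0.5 * (3 - x * E * E)
  for (int i = 0; i < steps; ++i)
    est = est * 0.5 * (3.0 - x * est * est);
  return est;
}

static double refineRecip(double x, double est, int steps) {
  // Newton step for 1/x:  E <- E * (2 - x * E)
  for (int i = 0; i < steps; ++i)
    est = est * (2.0 - x * est);
  return est;
}

int main() {
  double x = 2.0;
  // Deliberately coarse seeds stand in for FRSQRTE/FRECPE here.
  std::printf("1/sqrt(2): %.12f vs %.12f\n", refineRSqrt(x, 0.7, 3),
              1.0 / std::sqrt(x));
  std::printf("1/2:       %.12f vs %.12f\n", refineRecip(x, 0.4, 3), 1.0 / x);
  return 0;
}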
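
The performMulCombine rewrite is guided by the identities quoted in its comments, including the shift+add+shift form for constants of the shape (2^N + 1) * 2^M (the 6 = 3 * 2 example). A small plain-integer sketch of those identities, assuming nothing beyond two's-complement i64 arithmetic:

#include <cassert>
#include <cstdint>

// Plain-integer illustration of the strength reduction (modulo 2^64, exactly
// as the i64 DAG nodes behave):
//   x * (2^N + 1)         -> (x << N) + x
//   x * (2^N - 1)         -> (x << N) - x
//   x * ((2^N + 1) * 2^M) -> ((x << N) + x) << M     e.g. 6 = (2 + 1) * 2
static uint64_t mulBy6(uint64_t x) { return ((x << 1) + x) << 1; } // N=1, M=1
static uint64_t mulBy7(uint64_t x) { return (x << 3) - x; }        // 2^3 - 1
static uint64_t mulBy9(uint64_t x) { return (x << 3) + x; }        // 2^3 + 1

int main() {
  for (uint64_t x : {0ull, 1ull, 42ull, 0xfffffffffffffff3ull /* -13 */}) {
    assert(mulBy6(x) == x * 6);
    assert(mulBy7(x) == x * 7);
    assert(mulBy9(x) == x * 9);
  }
  return 0;
}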
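
The generalized interleaved-store lowering accepts masks whose first element per sub-vector may be undef and recovers the start index from any later defined element. A hypothetical stand-alone helper mirroring that search (undef spelled as -1, as in ShuffleVectorInst::getShuffleMask()), checked against the Factor-3 mask from the new doc comment:

#include <cstdio>
#include <vector>

// For factor F and lane length L, sub-vector i is described by the mask
// entries Mask[0*F+i], Mask[1*F+i], ...  If the first entry is undef, any
// later defined entry j recovers the start as Mask[j*F+i] - j; an all-undef
// lane can start anywhere (0 here), since its elements are written anyway.
static int startOfSubVector(const std::vector<int> &Mask, unsigned Factor,
                            unsigned LaneLen, unsigned I) {
  if (Mask[I] >= 0)
    return Mask[I];
  for (unsigned J = 1; J < LaneLen; ++J)
    if (Mask[J * Factor + I] >= 0)
      return Mask[J * Factor + I] - int(J);
  return 0;
}

int main() {
  // Factor 3, LaneLen 4: the mask from the patch's doc comment,
  // <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>.
  std::vector<int> Mask = {4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19};
  for (unsigned I = 0; I < 3; ++I)
    std::printf("sub-vector %u starts at element %d\n", I,
                startOfSubVector(Mask, 3, 4, I));
  return 0;
}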
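
Several open-coded offset range checks in the addressing-mode code are replaced with isInt<9>(Offset). As a reference point only, a hypothetical stand-in with the same meaning (fitsSignedImm is not an LLVM API):

#include <cassert>
#include <cstdint>

// Does Offset fit in an N-bit two's-complement immediate,
// i.e. in [-2^(N-1), 2^(N-1) - 1]?
template <unsigned N> static bool fitsSignedImm(int64_t Offset) {
  return Offset >= -(int64_t(1) << (N - 1)) &&
         Offset <= (int64_t(1) << (N - 1)) - 1;
}

int main() {
  // The 9-bit signed immediates used by AArch64 unscaled/indexed addressing.
  assert(fitsSignedImm<9>(-256) && fitsSignedImm<9>(255));
  assert(!fitsSignedImm<9>(-257) && !fitsSignedImm<9>(256));
  return 0;
}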