path: root/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Diffstat (limited to 'contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 837
1 file changed, 583 insertions(+), 254 deletions(-)
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 06bfe34..849058b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11,28 +11,79 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64ISelLowering.h"
#include "AArch64PerfectShuffle.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
-#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <bitset>
+#include <cassert>
+#include <cctype>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <tuple>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "aarch64-lower"
@@ -53,20 +104,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
-// Disabled for causing self-hosting failures once returned-attribute inference
-// was enabled.
-static cl::opt<bool>
-EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden,
- cl::desc("Directly forward this return"),
- cl::init(false));
-
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
-
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
setBooleanContents(ZeroOrOneBooleanContent);
@@ -116,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
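The two BITREVERSE lines added above let scalar bit reversals select directly to AArch64's rbit instruction. A minimal illustration (hypothetical example, not part of this commit; __builtin_bitreverse32 is Clang's builtin that lowers to ISD::BITREVERSE):

    // With ISD::BITREVERSE legal for i32, this should compile to a single
    // "rbit w0, w0" on AArch64.
    unsigned reverse_bits(unsigned X) { return __builtin_bitreverse32(X); }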
@@ -225,7 +270,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
@@ -520,6 +564,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
setPrefLoopAlignment(STI.getPrefLoopAlignment());
+ // Only change the limit for entries in a jump table if it is specified by
+ // the subtarget and has not already been set on the command line.
+ unsigned MaxJT = STI.getMaximumJumpTableSize();
+ if (MaxJT && getMaximumJumpTableSize() == 0)
+ setMaximumJumpTableSize(MaxJT);
+
setHasExtractBitsInsn(true);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -764,7 +814,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = KnownOne.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
- unsigned MemBits = VT.getScalarType().getSizeInBits();
+ unsigned MemBits = VT.getScalarSizeInBits();
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
@@ -960,8 +1010,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
- case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
+ case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
+ case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
+ case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
}
return nullptr;
}
@@ -1186,7 +1238,8 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
break;
case ISD::SETUO:
- Invert = true; // Fallthrough
+ Invert = true;
+ LLVM_FALLTHROUGH;
case ISD::SETO:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GE;
@@ -2136,7 +2189,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfSize = EltSize / 2;
if (isSigned) {
if (!isIntN(HalfSize, C->getSExtValue()))
@@ -2163,7 +2216,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
EVT VT = N->getValueType(0);
SDLoc dl(N);
- unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+ unsigned EltSize = VT.getScalarSizeInBits() / 2;
unsigned NumElts = VT.getVectorNumElements();
MVT TruncVT = MVT::getIntegerVT(EltSize);
SmallVector<SDValue, 8> Ops;
@@ -2435,18 +2488,25 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
+ case CallingConv::Swift:
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
}
}
+CCAssignFn *
+AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
+ return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
+}
+
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2499,7 +2559,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types too.
unsigned FrameIdx =
- MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+ MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
@@ -2564,7 +2624,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
!Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
- int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
+ int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
@@ -2614,7 +2674,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
- FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+ FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
}
unsigned StackArgSize = CCInfo.getNextStackOffset();
@@ -2645,7 +2705,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2660,7 +2720,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
- GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
+ GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
@@ -2688,7 +2748,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
- FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+ FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
@@ -2735,7 +2795,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
- if (i == 0 && isThisReturn && EnableThisRetForwarding) {
+ if (i == 0 && isThisReturn) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
@@ -2763,15 +2823,29 @@ SDValue AArch64TargetLowering::LowerCallResult(
return Chain;
}
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::PreserveMost:
+ case CallingConv::Swift:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
- // For CallingConv::C this function knows whether the ABI needs
- // changing. That's not true for other conventions so they will have to opt in
- // manually.
- if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+ if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
@@ -2788,9 +2862,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
if (i->hasByValAttr())
return false;
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- return IsTailCallConvention(CalleeCC) && CCMatch;
- }
+ if (getTargetMachine().Options.GuaranteedTailCallOpt)
+ return canGuaranteeTCO(CalleeCC) && CCMatch;
// Externally-defined functions with weak linkage should not be
// tail-called on AArch64 when the OS does not support dynamic
@@ -2872,11 +2945,11 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
- MachineFrameInfo *MFI,
+ MachineFrameInfo &MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
- int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
- int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
+ int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+ int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
@@ -2890,9 +2963,9 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
- int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
+ int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
- InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
+ InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
@@ -2908,11 +2981,6 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
return CallCC == CallingConv::Fast && TailCallOpt;
}
-bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
- return CallCC == CallingConv::Fast ||
- CallCC == CallingConv::PreserveMost;
-}
-
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
@@ -3087,7 +3155,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
if (VA.isRegLoc()) {
- if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+ if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
+ Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
@@ -3119,7 +3188,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
Offset = Offset + FPDiff;
- int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo =
@@ -3253,7 +3322,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
- MF.getFrameInfo()->setHasTailCall();
+ MF.getFrameInfo().setHasTailCall();
return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
}
@@ -3444,15 +3513,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// The first entry in the descriptor is a function pointer that we must call
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
- SDValue FuncTLVGet =
- DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- /* Alignment = */ 8, MachineMemOperand::MONonTemporal |
- MachineMemOperand::MOInvariant);
+ SDValue FuncTLVGet = DAG.getLoad(
+ MVT::i64, DL, Chain, DescAddr,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ /* Alignment = */ 8,
+ MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- MFI->setAdjustsStack(true);
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setAdjustsStack(true);
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
@@ -3614,6 +3684,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
llvm_unreachable("Unexpected platform trying to use TLS");
}
+
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -3705,7 +3776,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
- uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+ uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
@@ -3715,7 +3786,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
- uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+ uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
@@ -4036,6 +4107,33 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
}
}
+ // Avoid materializing a constant when possible by reusing a known value in
+ // a register. However, don't perform this optimization if the known value
+ // is one, zero or negative one in the case of a CSEL. We can always
+ // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
+ // FVal, respectively.
+ ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
+ if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
+ !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
+ // "a != C ? x : a" to avoid materializing C.
+ if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
+ TVal = LHS;
+ else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
+ FVal = LHS;
+ } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
+ assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
+ // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
+ // avoid materializing C.
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
+ Opcode = AArch64ISD::CSINV;
+ TVal = LHS;
+ FVal = DAG.getConstant(0, dl, FVal.getValueType());
+ }
+ }
+
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
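A worked illustration of the constant-reuse transform above (hypothetical source and expected assembly shape, assuming 'a' arrives in w0 and 'b' in w1):

    // "a == 7 ? 7 : b": on the taken path a already equals 7, so the CSEL can
    // reuse w0 instead of materializing the constant with "mov w8, #7":
    //   cmp  w0, #7
    //   csel w0, w0, w1, eq
    int select7(int a, int b) { return a == 7 ? 7 : b; }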
@@ -4053,6 +4151,26 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// clean. Some of them require two CSELs to implement.
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
+
+ if (DAG.getTarget().Options.UnsafeFPMath) {
+ // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
+ // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
+ ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
+ if (RHSVal && RHSVal->isZero()) {
+ ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
+ ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
+
+ if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
+ CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
+ TVal = LHS;
+ else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
+ CFVal && CFVal->isZero() &&
+ FVal.getValueType() == LHS.getValueType())
+ FVal = LHS;
+ }
+ }
+
+ // Emit first, and possibly only, CSEL.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
@@ -4378,8 +4496,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- MFI->setFrameAddressIsTaken(true);
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -4408,8 +4526,8 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MFI->setReturnAddressIsTaken(true);
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -4484,7 +4602,6 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
-
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
@@ -4559,38 +4676,96 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
-/// getEstimate - Return the appropriate estimate DAG for either the reciprocal
-/// or the reciprocal square root.
-static SDValue getEstimate(const AArch64Subtarget &ST,
- const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
- const SDValue &Operand, unsigned &ExtraSteps) {
- if (!ST.hasNEON())
- return SDValue();
-
+static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
+ SDValue Operand, SelectionDAG &DAG,
+ int &ExtraSteps) {
EVT VT = Operand.getValueType();
+ if (ST->hasNEON() &&
+ (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
+ VT == MVT::f32 || VT == MVT::v1f32 ||
+ VT == MVT::v2f32 || VT == MVT::v4f32)) {
+ if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
+ // For the reciprocal estimates, convergence is quadratic, so the number
+ // of digits is doubled after each iteration. In ARMv8, the accuracy of
+ // the initial estimate is 2^-8. Thus the number of extra steps to refine
+ // the result for float (23 mantissa bits) is 2 and for double (52
+ // mantissa bits) is 3.
+ ExtraSteps = VT == MVT::f64 ? 3 : 2;
- std::string RecipOp;
- RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt";
- RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp;
- RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f";
+ return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+ }
- TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
- if (!Recips.isEnabled(RecipOp))
- return SDValue();
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
+ SelectionDAG &DAG, int Enabled,
+ int &ExtraSteps,
+ bool &UseOneConst,
+ bool Reciprocal) const {
+ if (Enabled == ReciprocalEstimate::Enabled ||
+ (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
+ if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
+ DAG, ExtraSteps)) {
+ SDLoc DL(Operand);
+ EVT VT = Operand.getValueType();
+
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
+
+ // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
+ // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
+ for (int i = ExtraSteps; i > 0; --i) {
+ SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
+ &Flags);
+ Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+ }
+
+ if (!Reciprocal) {
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ VT);
+ SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+ SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
+
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
+ // Correct the result if the operand is 0.0.
+ Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
+ VT, Eq, Operand, Estimate);
+ }
+
+ ExtraSteps = 0;
+ return Estimate;
+ }
- ExtraSteps = Recips.getRefinementSteps(RecipOp);
- return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+ return SDValue();
}
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
- DAGCombinerInfo &DCI, unsigned &ExtraSteps) const {
- return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps);
-}
+ SelectionDAG &DAG, int Enabled,
+ int &ExtraSteps) const {
+ if (Enabled == ReciprocalEstimate::Enabled)
+ if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
+ DAG, ExtraSteps)) {
+ SDLoc DL(Operand);
+ EVT VT = Operand.getValueType();
+
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
+
+ // Newton reciprocal iteration: E * (2 - X * E)
+ // AArch64 reciprocal iteration instruction: (2 - M * N)
+ for (int i = ExtraSteps; i > 0; --i) {
+ SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
+ Estimate, &Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+ }
-SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
- DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
- UseOneConst = true;
- return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
+ ExtraSteps = 0;
+ return Estimate;
+ }
+
+ return SDValue();
}
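The refinement loops above implement standard Newton-Raphson steps built from the fused AArch64 operations; a sketch of the underlying arithmetic (textbook derivation, stated here as an aid, not taken from the commit):

    \[ x_{n+1} = x_n \, (2 - a x_n) \qquad \text{(reciprocal; FRECPS computes } 2 - ab\text{)} \]
    \[ x_{n+1} = \tfrac{1}{2} \, x_n \, (3 - a x_n^2) \qquad \text{(rsqrt; FRSQRTS computes } \tfrac{1}{2}(3 - ab)\text{)} \]

Convergence is quadratic, so a relative error of $2^{-8}$ becomes roughly $2^{-8 \cdot 2^k}$ after $k$ steps: $k = 2$ covers float's 24-bit significand and $k = 3$ covers double's 53-bit significand, matching the ExtraSteps defaults above. The final select in the non-reciprocal path exists because sqrt(a) is computed as a * rsqrt(a), and rsqrt(0) is infinity, so 0 * inf would yield NaN instead of 0.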
//===----------------------------------------------------------------------===//
@@ -4704,7 +4879,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
case 'w':
- if (VT == MVT::f32)
+ if (VT.getSizeInBits() == 16)
+ return std::make_pair(0U, &AArch64::FPR16RegClass);
+ if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
@@ -4949,10 +5126,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int WindowBase;
int WindowScale;
- bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
ShuffleSourceInfo(SDValue Vec)
- : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
- WindowScale(1) {}
+ : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
+ ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
+
+ bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
@@ -4971,7 +5149,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
- auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+ auto Source = find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
@@ -4996,7 +5174,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
}
}
unsigned ResMultiplier =
- VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+ VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
@@ -5081,21 +5259,21 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
- int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+ int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
continue;
- auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+ auto Src = find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
- int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
- VT.getVectorElementType().getSizeInBits());
+ int BitsDefined =
+ std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
@@ -5157,8 +5335,7 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
unsigned &Imm) {
// Look for the first non-undef element.
- const int *FirstRealElt = std::find_if(M.begin(), M.end(),
- [](int Elt) {return Elt >= 0;});
+ const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
// Benefit from APInt to handle overflow when calculating the expected element.
unsigned NumElts = VT.getVectorNumElements();
@@ -5200,7 +5377,7 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
"Only possible block sizes for REV are: 16, 32, 64");
- unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ unsigned EltSz = VT.getScalarSizeInBits();
if (EltSz == 64)
return false;
@@ -5381,7 +5558,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
VT.getVectorElementType() != V1.getValueType().getVectorElementType())
return SDValue();
- bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
+ bool SplitV0 = V0.getValueSizeInBits() == 128;
if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
@@ -5392,7 +5569,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
DAG.getConstant(0, DL, MVT::i64));
}
- if (V1.getValueType().getSizeInBits() == 128) {
+ if (V1.getValueSizeInBits() == 128) {
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
DAG.getConstant(0, DL, MVT::i64));
}
@@ -5523,7 +5700,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
MVT IndexVT = MVT::v8i8;
unsigned IndexLen = 8;
- if (Op.getValueType().getSizeInBits() == 128) {
+ if (Op.getValueSizeInBits() == 128) {
IndexVT = MVT::v16i8;
IndexLen = 16;
}
@@ -5918,7 +6095,7 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
// Is C1 == ~C2, taking into account how much one can shift elements of a
// particular size?
uint64_t C2 = C2node->getZExtValue();
- unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
+ unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
unsigned ElemMask = (1 << ElemSizeInBits) - 1;
@@ -6351,7 +6528,7 @@ FailedModImm:
// DUPLANE works on 128-bit vectors, widen it if necessary.
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
- if (Value.getValueType().getSizeInBits() == 64)
+ if (Value.getValueSizeInBits() == 64)
Value = WidenVector(Value, DAG);
unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
@@ -6414,7 +6591,7 @@ FailedModImm:
if (!isConstant && !usesOnlyOneValue) {
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
- unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
+ unsigned ElemSize = VT.getScalarSizeInBits();
unsigned i = 0;
// For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
@@ -6528,7 +6705,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return SDValue();
unsigned Val = Cst->getZExtValue();
- unsigned Size = Op.getValueType().getSizeInBits();
+ unsigned Size = Op.getValueSizeInBits();
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
if (Val == 0)
@@ -6536,7 +6713,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
- if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
+ if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
return Op;
return SDValue();
@@ -6606,7 +6783,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
/// 0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
@@ -6617,7 +6794,7 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
/// 1 <= Value <= ElementBits for a right shift; or
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
@@ -6631,7 +6808,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
if (!Op.getOperand(1).getValueType().isVector())
return Op;
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ unsigned EltSize = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
default:
@@ -6716,8 +6893,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
case AArch64CC::LT:
if (!NoNans)
return SDValue();
- // If we ignore NaNs then we can use the MI implementation.
- // Fallthrough.
+ // If we ignore NaNs then we can use the MI implementation.
+ LLVM_FALLTHROUGH;
case AArch64CC::MI:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
@@ -6904,7 +7081,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::aarch64_ldaxp:
- case Intrinsic::aarch64_ldxp: {
+ case Intrinsic::aarch64_ldxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
@@ -6914,9 +7091,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.readMem = true;
Info.writeMem = false;
return true;
- }
case Intrinsic::aarch64_stlxp:
- case Intrinsic::aarch64_stxp: {
+ case Intrinsic::aarch64_stxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
@@ -6926,7 +7102,6 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.readMem = false;
Info.writeMem = true;
return true;
- }
default:
break;
}
@@ -7033,8 +7208,8 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getModule()->getDataLayout();
- std::advance(GTI, U.getOperandNo());
- Type *IdxTy = *GTI;
+ std::advance(GTI, U.getOperandNo()-1);
+ Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
// Get the shift amount based on the scaling factor:
@@ -7052,7 +7227,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
// trunc(sext ty1 to ty2) to ty1.
if (Instr->getType() == Ext->getOperand(0)->getType())
continue;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
default:
return false;
}
@@ -7063,16 +7238,6 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
return true;
}
-bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
- unsigned &RequiredAligment) const {
- if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
- return false;
- // Cyclone supports unaligned accesses.
- RequiredAligment = 0;
- unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
- return NumBits == 32 || NumBits == 64;
-}
-
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
unsigned &RequiredAligment) const {
if (!LoadedType.isSimple() ||
@@ -7167,7 +7332,7 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
-/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
@@ -7178,6 +7343,17 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
+/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
+/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
+/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
@@ -7188,9 +7364,9 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(VecTy->getVectorNumElements() % Factor == 0 &&
"Invalid interleaved store");
- unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+ unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+ VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
@@ -7215,7 +7391,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, NumSubElts);
+ SubVecTy = VectorType::get(IntTy, LaneLen);
}
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
@@ -7229,9 +7405,28 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
SmallVector<Value *, 5> Ops;
// Split the shufflevector operands into sub vectors for the new stN call.
- for (unsigned i = 0; i < Factor; i++)
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++) {
+ if (Mask[i] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ if (Mask[j*Factor + i] >= 0) {
+ StartMask = Mask[j*Factor + i] - j;
+ break;
+ }
+ }
+ // Note: Filling undef gaps with arbitrary elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // If all elements in a chunk are undef, StartMask stays 0 and we
+ // default to using elements from lane 0.
+ // StartMask cannot be negative; that is checked in isReInterleaveMask.
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+ }
+ }
Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
Builder.CreateCall(StNFunc, Ops);
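A standalone sketch of the StartMask recovery performed in the loop above (hypothetical helper, not LLVM code; it assumes the usual interleaved-mask layout Mask[j*Factor + i]):

    #include <cassert>
    #include <vector>

    // Return the starting element for lane I when Mask[I] may be undef (-1):
    // take the first defined element of the lane and subtract its position J.
    static unsigned startMask(const std::vector<int> &Mask, unsigned Factor,
                              unsigned LaneLen, unsigned I) {
      if (Mask[I] >= 0)
        return Mask[I];
      for (unsigned J = 1; J < LaneLen; ++J)
        if (Mask[J * Factor + I] >= 0)
          return Mask[J * Factor + I] - J;
      return 0; // whole lane undef: any start will do, default to 0
    }

    int main() {
      // Factor = 3, LaneLen = 4; lane 1 starts undef but continues 33, 34, 35.
      std::vector<int> Mask = {4, -1, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19};
      assert(startMask(Mask, 3, 4, 1) == 32); // 33 - 1
      return 0;
    }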
@@ -7323,7 +7518,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
int64_t Offset = AM.BaseOffs;
// 9-bit signed offset
- if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
+ if (isInt<9>(Offset))
return true;
// 12-bit unsigned offset
@@ -7337,8 +7532,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
- return !AM.Scale || AM.Scale == 1 ||
- (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
+ return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
@@ -7544,57 +7738,98 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ // The below optimizations require a constant RHS.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+
+ ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
+ const APInt &ConstValue = C->getAPIntValue();
+
// Multiplication of a power of two plus/minus one can be done more
// cheaply as a shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- const APInt &Value = C->getAPIntValue();
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
- if (Value.isNonNegative()) {
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- APInt VM1 = Value - 1;
- if (VM1.isPowerOf2()) {
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(VM1.logBase2(), DL, MVT::i64));
- return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal,
- N->getOperand(0));
- }
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- APInt VP1 = Value + 1;
- if (VP1.isPowerOf2()) {
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(VP1.logBase2(), DL, MVT::i64));
- return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal,
- N->getOperand(0));
- }
- } else {
- // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
- APInt VNP1 = -Value + 1;
- if (VNP1.isPowerOf2()) {
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(VNP1.logBase2(), DL, MVT::i64));
- return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0),
- ShiftedVal);
- }
- // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
- APInt VNM1 = -Value - 1;
- if (VNM1.isPowerOf2()) {
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(VNM1.logBase2(), DL, MVT::i64));
- SDValue Add =
- DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0));
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add);
- }
- }
+ // More aggressively, some multiplications N0 * C can be lowered to
+ // shift+add+shift if the constant C = A * B, where A = 2^N + 1 and
+ // B = 2^M, e.g. 6 = 3 * 2 = (2 + 1) * 2.
+ // TODO: consider lowering more cases, e.g. C = 14, -6, -14, or even 45,
+ // which equals (1 + 2) * 16 - (1 + 2).
+ SDValue N0 = N->getOperand(0);
+ // TrailingZeroes is used to test if the mul can be lowered to
+ // shift+add+shift.
+ unsigned TrailingZeroes = ConstValue.countTrailingZeros();
+ if (TrailingZeroes) {
+ // Conservatively do not lower to shift+add+shift if the mul might be
+ // folded into smul or umul.
+ if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
+ isZeroExtended(N0.getNode(), DAG)))
+ return SDValue();
+ // Conservatively do not lower to shift+add+shift if the mul might be
+ // folded into madd or msub.
+ if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
+ N->use_begin()->getOpcode() == ISD::SUB))
+ return SDValue();
}
- return SDValue();
+ // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
+ // and shift+add+shift.
+ APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
+
+ unsigned ShiftAmt, AddSubOpc;
+ // Is the shifted value the LHS operand of the add/sub?
+ bool ShiftValUseIsN0 = true;
+ // Do we need to negate the result?
+ bool NegateResult = false;
+
+ if (ConstValue.isNonNegative()) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
+ APInt SCVMinus1 = ShiftedConstValue - 1;
+ APInt CVPlus1 = ConstValue + 1;
+ if (SCVMinus1.isPowerOf2()) {
+ ShiftAmt = SCVMinus1.logBase2();
+ AddSubOpc = ISD::ADD;
+ } else if (CVPlus1.isPowerOf2()) {
+ ShiftAmt = CVPlus1.logBase2();
+ AddSubOpc = ISD::SUB;
+ } else
+ return SDValue();
+ } else {
+ // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+ // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+ APInt CVNegPlus1 = -ConstValue + 1;
+ APInt CVNegMinus1 = -ConstValue - 1;
+ if (CVNegPlus1.isPowerOf2()) {
+ ShiftAmt = CVNegPlus1.logBase2();
+ AddSubOpc = ISD::SUB;
+ ShiftValUseIsN0 = false;
+ } else if (CVNegMinus1.isPowerOf2()) {
+ ShiftAmt = CVNegMinus1.logBase2();
+ AddSubOpc = ISD::ADD;
+ NegateResult = true;
+ } else
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(ShiftAmt, DL, MVT::i64));
+
+ SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
+ SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
+ SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
+ assert(!(NegateResult && TrailingZeroes) &&
+ "NegateResult and TrailingZeroes cannot both be true for now.");
+ // Negate the result.
+ if (NegateResult)
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
+ // Shift the result.
+ if (TrailingZeroes)
+ return DAG.getNode(ISD::SHL, DL, VT, Res,
+ DAG.getConstant(TrailingZeroes, DL, MVT::i64));
+ return Res;
}
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
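A worked instance of the new shift+add+shift path (hypothetical standalone code, not LLVM itself): for C = 6, TrailingZeroes = 1 and ShiftedConstValue = 3; since 3 - 1 = 2 is a power of two, the multiply becomes an add of a shifted value followed by the final shift:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // x * 6 == ((x << 1) + x) << 1, since 6 = (2^1 + 1) * 2^1.
    static int64_t mulBy6(int64_t X) { return ((X << 1) + X) << 1; }

    int main() {
      for (int64_t X : {0, 1, -3, 12345})
        assert(mulBy6(X) == X * 6);
      return 0;
    }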
@@ -7655,7 +7890,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Only optimize when the source and destination types have the same width.
- if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
+ if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// If the result of an integer load is only used by an integer-to-float
@@ -7757,13 +7992,15 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue Op = N->getOperand(0);
unsigned Opc = Op->getOpcode();
- if (!Op.getValueType().isVector() ||
+ if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+ !Op.getOperand(0).getValueType().isSimple() ||
(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
return SDValue();
@@ -7800,10 +8037,13 @@ static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
- ResTy = MVT::v4i32;
+ ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
+ if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+ return SDValue();
+
SDLoc DL(N);
SDValue ConvInput = Op.getOperand(0);
bool IsSigned = Opc == ISD::SINT_TO_FP;
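For context (illustrative mapping, assuming the standard NEON intrinsics header): the combine rewrites (sint_to_fp v) fdiv 2^C into a single fixed-point convert, the same operation exposed in C by vcvtq_n_f32_s32:

    #include <arm_neon.h>

    // Convert and divide by 2^4 in one instruction; this compiles to
    // "scvtf v0.4s, v0.4s, #4", the pattern the combine above forms.
    float32x4_t fromFixedPoint(int32x4_t V) { return vcvtq_n_f32_s32(V, 4); }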
@@ -7855,13 +8095,13 @@ static SDValue tryCombineToEXTR(SDNode *N,
SDValue LHS;
uint32_t ShiftLHS = 0;
- bool LHSFromHi = 0;
+ bool LHSFromHi = false;
if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
return SDValue();
SDValue RHS;
uint32_t ShiftRHS = 0;
- bool RHSFromHi = 0;
+ bool RHSFromHi = false;
if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
return SDValue();
@@ -7901,7 +8141,7 @@ static SDValue tryCombineToBSL(SDNode *N,
// We only have to look for constant vectors here since the general, variable
// case can be handled in TableGen.
- unsigned Bits = VT.getVectorElementType().getSizeInBits();
+ unsigned Bits = VT.getScalarSizeInBits();
uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
for (int i = 1; i >= 0; --i)
for (int j = 1; j >= 0; --j) {
@@ -8090,7 +8330,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
if (N0 == N1 && VT.getVectorNumElements() == 2) {
- assert(VT.getVectorElementType().getSizeInBits() == 64);
+ assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));
}
@@ -8153,7 +8393,7 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
// The vector width should be 128 bits by the time we get here, even
// if it started as 64 bits (the extract_vector handling will have
// done so).
- assert(Vec.getValueType().getSizeInBits() == 128 &&
+ assert(Vec.getValueSizeInBits() == 128 &&
"unexpected vector size on extract_vector_elt!");
if (Vec.getValueType() == MVT::v4i32)
VecResTy = MVT::v4f32;
@@ -8655,7 +8895,7 @@ static SDValue performExtendCombine(SDNode *N,
if (SrcVT.getSizeInBits() != 64)
return SDValue();
- unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+ unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
unsigned ElementCount = SrcVT.getVectorNumElements();
SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
SDLoc DL(N);
@@ -8684,13 +8924,101 @@ static SDValue performExtendCombine(SDNode *N,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
+static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
+ SDValue SplatVal, unsigned NumVecElts) {
+ unsigned OrigAlignment = St.getAlignment();
+ unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
+
+ // Create scalar stores. This is at least as good as the code sequence for a
+ // split unaligned store which is a dup.s, ext.b, and two stores.
+ // Most of the time the three stores should be replaced by store pair
+ // instructions (stp).
+ SDLoc DL(&St);
+ SDValue BasePtr = St.getBasePtr();
+ const MachinePointerInfo &PtrInfo = St.getPointerInfo();
+ SDValue NewST1 =
+ DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
+ OrigAlignment, St.getMemOperand()->getFlags());
+
+ unsigned Offset = EltOffset;
+ while (--NumVecElts) {
+ unsigned Alignment = MinAlign(OrigAlignment, Offset);
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(Offset, DL, MVT::i64));
+ NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+ PtrInfo.getWithOffset(Offset), Alignment,
+ St.getMemOperand()->getFlags());
+ Offset += EltOffset;
+ }
+ return NewST1;
+}
+
+/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
+/// load store optimizer pass will merge them to store pair stores. This should
+/// be better than a movi to create the vector zero followed by a vector store
+/// if the zero constant is not re-used, since one instruction and one register
+/// live range will be removed.
+///
+/// For example, the final generated code should be:
+///
+/// stp xzr, xzr, [x0]
+///
+/// instead of:
+///
+/// movi v0.2d, #0
+/// str q0, [x0]
+///
+static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
+ SDValue StVal = St.getValue();
+ EVT VT = StVal.getValueType();
+
+ // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
+ // 2, 3 or 4 i32 elements.
+ int NumVecElts = VT.getVectorNumElements();
+ if (!(((NumVecElts == 2 || NumVecElts == 3) &&
+ VT.getVectorElementType().getSizeInBits() == 64) ||
+ ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
+ VT.getVectorElementType().getSizeInBits() == 32)))
+ return SDValue();
+
+ if (StVal.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // If the zero constant has more than one use then the vector store could be
+ // better, since the constant mov will be amortized and stp q instructions
+ // can then be formed.
+ if (!StVal.hasOneUse())
+ return SDValue();
+
+ // If the immediate offset of the address operand is too large for the stp
+ // instruction, then bail out.
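+ // (An stp of two 64-bit registers encodes a signed 7-bit immediate scaled
+ // by 8, which is where the [-512, 504] bounds below come from.)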
+ if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
+ int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
+ if (Offset < -512 || Offset > 504)
+ return SDValue();
+ }
+
+ for (int I = 0; I < NumVecElts; ++I) {
+ SDValue EltVal = StVal.getOperand(I);
+ if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
+ return SDValue();
+ }
+
+ // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
+ // undoing this transformation.
+ SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
+ ? DAG.getRegister(AArch64::WZR, MVT::i32)
+ : DAG.getRegister(AArch64::XZR, MVT::i64);
+ return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
+}
+
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
-static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
- SDValue StVal = St->getValue();
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
+ SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// Don't replace floating point stores, they possibly won't be transformed to
@@ -8698,55 +9026,48 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
if (VT.isFloatingPoint())
return SDValue();
- // Check for insert vector elements.
- if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
- return SDValue();
-
// We can express a splat as store pair(s) for 2 or 4 elements.
unsigned NumVecElts = VT.getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 2)
return SDValue();
- SDValue SplatVal = StVal.getOperand(1);
- unsigned RemainInsertElts = NumVecElts - 1;
// Check that this is a splat.
- while (--RemainInsertElts) {
- SDValue NextInsertElt = StVal.getOperand(0);
- if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ // Make sure that each of the relevant vector element locations are inserted
+ // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
+ std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
+ SDValue SplatVal;
+ for (unsigned I = 0; I < NumVecElts; ++I) {
+ // Check for insert vector elements.
+ if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
return SDValue();
- if (NextInsertElt.getOperand(1) != SplatVal)
+
+ // Check that same value is inserted at each vector element.
+ if (I == 0)
+ SplatVal = StVal.getOperand(1);
+ else if (StVal.getOperand(1) != SplatVal)
return SDValue();
- StVal = NextInsertElt;
- }
- unsigned OrigAlignment = St->getAlignment();
- unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
- unsigned Alignment = std::min(OrigAlignment, EltOffset);
- // Create scalar stores. This is at least as good as the code sequence for a
- // split unaligned store which is a dup.s, ext.b, and two stores.
- // Most of the time the three stores should be replaced by store pair
- // instructions (stp).
- SDLoc DL(St);
- SDValue BasePtr = St->getBasePtr();
- SDValue NewST1 =
- DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
+ // Check insert element index.
+ ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
+ if (!CIndex)
+ return SDValue();
+ uint64_t IndexVal = CIndex->getZExtValue();
+ if (IndexVal >= NumVecElts)
+ return SDValue();
+ IndexNotInserted.reset(IndexVal);
- unsigned Offset = EltOffset;
- while (--NumVecElts) {
- SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
- DAG.getConstant(Offset, DL, MVT::i64));
- NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
- St->getPointerInfo(), Alignment,
- St->getMemOperand()->getFlags());
- Offset += EltOffset;
+ StVal = StVal.getOperand(0);
}
- return NewST1;
+ // Check that all vector element locations were inserted to.
+ if (IndexNotInserted.any())
+ return SDValue();
+
+ return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
-static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
+static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -8754,6 +9075,17 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (S->isVolatile())
return SDValue();
+ SDValue StVal = S->getValue();
+ EVT VT = StVal.getValueType();
+ if (!VT.isVector())
+ return SDValue();
+
+ // If we get a splat of zeros, convert this vector store to a store of
+ // scalars. They will be merged into store pairs of xzr thereby removing one
+ // instruction and one register.
+ if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
+ return ReplacedZeroSplat;
+
// FIXME: The logic for deciding if an unaligned store should be split should
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
@@ -8765,12 +9097,9 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
- SDValue StVal = S->getValue();
- EVT VT = StVal.getValueType();
-
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
// those up regresses performance on micro-benchmarks and olden/bh.
- if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+ if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
return SDValue();
// Split unaligned 16B stores. They are terrible for performance.
@@ -8785,7 +9114,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// If we get a splat of a scalar convert this vector store to a store of
// scalars. They will be merged into store pairs thereby removing two
// instructions.
- if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S))
+ if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
return ReplacedSplat;
SDLoc DL(S);
@@ -8928,7 +9257,7 @@ static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
+ if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
@@ -9455,52 +9784,51 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width,
switch(CC) {
case AArch64CC::LE:
- case AArch64CC::GT: {
+ case AArch64CC::GT:
if ((AddConstant == 0) ||
(CompConstant == MaxUInt - 1 && AddConstant < 0) ||
(AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
return true;
- } break;
+ break;
case AArch64CC::LT:
- case AArch64CC::GE: {
+ case AArch64CC::GE:
if ((AddConstant == 0) ||
(AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
return true;
- } break;
+ break;
case AArch64CC::HI:
- case AArch64CC::LS: {
+ case AArch64CC::LS:
if ((AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant >= -1 &&
CompConstant < AddConstant + MaxUInt))
return true;
- } break;
+ break;
case AArch64CC::PL:
- case AArch64CC::MI: {
+ case AArch64CC::MI:
if ((AddConstant == 0) ||
(AddConstant > 0 && CompConstant <= 0) ||
(AddConstant < 0 && CompConstant <= AddConstant))
return true;
- } break;
+ break;
case AArch64CC::LO:
- case AArch64CC::HS: {
+ case AArch64CC::HS:
if ((AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant >= 0 &&
CompConstant <= AddConstant + MaxUInt))
return true;
- } break;
+ break;
case AArch64CC::EQ:
- case AArch64CC::NE: {
+ case AArch64CC::NE:
if ((AddConstant > 0 && CompConstant < 0) ||
(AddConstant < 0 && CompConstant >= 0 &&
CompConstant < AddConstant + MaxUInt) ||
(AddConstant >= 0 && CompConstant >= 0 &&
CompConstant >= AddConstant) ||
(AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
-
return true;
- } break;
+ break;
case AArch64CC::VS:
case AArch64CC::VC:
case AArch64CC::AL:
@@ -9862,7 +10190,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_TO_UINT:
return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
- return performFDivCombine(N, DAG, Subtarget);
+ return performFDivCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::SRL:
@@ -9995,8 +10323,10 @@ bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
- int64_t RHSC = (int64_t)RHS->getZExtValue();
- if (RHSC >= 256 || RHSC <= -256)
+ int64_t RHSC = RHS->getSExtValue();
+ if (Op->getOpcode() == ISD::SUB)
+ RHSC = -(uint64_t)RHSC;
+ if (!isInt<9>(RHSC))
return false;
IsInc = (Op->getOpcode() == ISD::ADD);
Offset = Op->getOperand(1);
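A minimal sketch of the imm9 range test used above (assumed semantics of llvm::isInt<9>; not LLVM code):

    #include <cstdint>

    // A 9-bit signed immediate covers [-256, 255], the offset range accepted
    // for AArch64 pre/post-indexed addressing.
    constexpr bool fitsImm9(int64_t V) { return V >= -256 && V <= 255; }
    static_assert(fitsImm9(-256) && fitsImm9(255) && !fitsImm9(256),
                  "imm9 covers [-256, 255]");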
@@ -10222,7 +10552,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
if (ValTy->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
- Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+ Function *Ldxr = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
@@ -10238,7 +10568,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
- Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+ Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
return Builder.CreateTruncOrBitCast(
Builder.CreateCall(Ldxr, Addr),
@@ -10248,8 +10578,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilder<> &Builder) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Builder.CreateCall(
- llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,