Diffstat (limited to 'contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp | 1823
1 file changed, 1219 insertions, 604 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 0f84a23..27dda93 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -13,46 +13,100 @@ //===----------------------------------------------------------------------===// #include "ARMISelLowering.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" #include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMPerfectShuffle.h" +#include "ARMRegisterInfo.h" +#include "ARMSelectionDAGInfo.h" #include "ARMSubtarget.h" -#include "ARMTargetMachine.h" -#include "ARMTargetObjectFile.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/MC/MCSectionMachO.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" 
+#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <iterator> +#include <limits> +#include <string> +#include <tuple> #include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "arm-isel" @@ -72,7 +126,7 @@ static cl::opt<bool> EnableConstpoolPromotion( "arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), - cl::init(true)); + cl::init(false)); // FIXME: set to true by default once PR32780 is fixed static cl::opt<unsigned> ConstpoolPromotionMaxSize( "arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), @@ -82,21 +136,6 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal( cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); -namespace { - class ARMCCState : public CCState { - public: - ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &locs, LLVMContext &C, - ParmContext PC) - : CCState(CC, isVarArg, MF, locs, C) { - assert(((PC == Call) || (PC == Prologue)) && - "ARMCCState users must specify whether their context is call" - "or prologue generation."); - CallOrPrologue = PC; - } - }; -} - // The APCS parameter registers. static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 @@ -162,7 +201,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) - for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); } @@ -433,9 +472,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. 
- if (Subtarget->isTargetWatchOS() || - (Subtarget->isTargetIOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { + if (Subtarget->isTargetMachO() && + !(Subtarget->isTargetIOS() && + Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -545,7 +584,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); setOperationAction(ISD::FSIN, MVT::v2f64, Expand); setOperationAction(ISD::FCOS, MVT::v2f64, Expand); - setOperationAction(ISD::FPOWI, MVT::v2f64, Expand); setOperationAction(ISD::FPOW, MVT::v2f64, Expand); setOperationAction(ISD::FLOG, MVT::v2f64, Expand); setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); @@ -563,7 +601,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); - setOperationAction(ISD::FPOWI, MVT::v4f32, Expand); setOperationAction(ISD::FPOW, MVT::v4f32, Expand); setOperationAction(ISD::FLOG, MVT::v4f32, Expand); setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); @@ -580,7 +617,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); setOperationAction(ISD::FSIN, MVT::v2f32, Expand); setOperationAction(ISD::FCOS, MVT::v2f32, Expand); - setOperationAction(ISD::FPOWI, MVT::v2f32, Expand); setOperationAction(ISD::FPOW, MVT::v2f32, Expand); setOperationAction(ISD::FLOG, MVT::v2f32, Expand); setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); @@ -685,10 +721,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - // ARM and Thumb2 support UMLAL/SMLAL. - if (!Subtarget->isThumb1Only()) - setTargetDAGCombine(ISD::ADDC); - if (Subtarget->isFPOnlySP()) { // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which @@ -707,7 +739,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); - setOperationAction(ISD::FPOWI, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FLOG, MVT::f64, Expand); setOperationAction(ISD::FLOG2, MVT::f64, Expand); @@ -786,14 +817,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); - if (!Subtarget->isThumb1Only()) { - // FIXME: We should do this for Thumb1 as well. 
- setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); - } + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); @@ -820,7 +849,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); - bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide() + bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() : Subtarget->hasDivideInARMMode(); if (!hasDivide) { // These are expanded into libcalls if the cpu doesn't have HW divider. @@ -828,7 +857,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i32, LibCall); } - if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) { + if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); @@ -1305,6 +1334,16 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; + case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; + case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; + case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; + case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; + case ARMISD::SMULWB: return "ARMISD::SMULWB"; + case ARMISD::SMULWT: return "ARMISD::SMULWT"; + case ARMISD::SMLALD: return "ARMISD::SMLALD"; + case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; + case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; + case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; @@ -1414,6 +1453,40 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { // Lowering Code //===----------------------------------------------------------------------===// +static bool isSRL16(const SDValue &Op) { + if (Op.getOpcode() != ISD::SRL) + return false; + if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return Const->getZExtValue() == 16; + return false; +} + +static bool isSRA16(const SDValue &Op) { + if (Op.getOpcode() != ISD::SRA) + return false; + if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return Const->getZExtValue() == 16; + return false; +} + +static bool isSHL16(const SDValue &Op) { + if (Op.getOpcode() != ISD::SHL) + return false; + if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return Const->getZExtValue() == 16; + return false; +} + +// Check for a signed 16-bit value. We special case SRA because it makes it +// more simple when also looking for SRAs that aren't sign extending a +// smaller value. Without the check, we'd need to take extra care with +// checking order for some operations. 
+static bool isS16(const SDValue &Op, SelectionDAG &DAG) { + if (isSRA16(Op)) + return isSHL16(Op.getOperand(0)); + return DAG.ComputeNumSignBits(Op) == 17; +} + /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { switch (CC) { @@ -1433,22 +1506,34 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, - ARMCC::CondCodes &CondCode2) { + ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { CondCode2 = ARMCC::AL; + InvalidOnQNaN = true; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: - case ISD::SETOEQ: CondCode = ARMCC::EQ; break; + case ISD::SETOEQ: + CondCode = ARMCC::EQ; + InvalidOnQNaN = false; + break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; - case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; + case ISD::SETONE: + CondCode = ARMCC::MI; + CondCode2 = ARMCC::GT; + InvalidOnQNaN = false; + break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; - case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; + case ISD::SETUEQ: + CondCode = ARMCC::EQ; + CondCode2 = ARMCC::VS; + InvalidOnQNaN = false; + break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: @@ -1456,7 +1541,10 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: - case ISD::SETUNE: CondCode = ARMCC::NE; break; + case ISD::SETUNE: + CondCode = ARMCC::NE; + InvalidOnQNaN = false; + break; } } @@ -1549,8 +1637,8 @@ SDValue ARMTargetLowering::LowerCallResult( // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, - *DAG.getContext(), Call); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); // Copy all of the result registers out of their specified physreg. @@ -1710,8 +1798,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext(), Call); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); // Get a count of how many bytes are to be pushed on the stack. @@ -1724,8 +1812,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); @@ -2088,10 +2175,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// this. 
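Note on the isS16 predicate introduced above: requiring 17 sign bits on an i32 node is the DAG-level way of saying that bits [31:15] are all copies of the sign bit, so the value survives truncation to a signed 16-bit integer. A standalone illustration of that property on plain integers (hypothetical helper, not part of the patch):

    #include <cstdint>

    // True when the 32-bit value round-trips through i16 truncation and
    // sign-extension -- the property ComputeNumSignBits(Op) == 17 establishes
    // for a DAG node.
    static bool fitsInSignedI16(int32_t v) {
      return v == static_cast<int32_t>(static_cast<int16_t>(v));
    }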
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, unsigned Align) const { - assert((State->getCallOrPrologue() == Prologue || - State->getCallOrPrologue() == Call) && - "unhandled ParmContext"); - // Byval (as with any stack) slots are always at least 4 byte aligned. Align = std::max(Align, 4U); @@ -2148,7 +2231,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const TargetInstrInfo *TII) { unsigned Bytes = Arg.getValueSizeInBits() / 8; - int FI = INT_MAX; + int FI = std::numeric_limits<int>::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) @@ -2178,7 +2261,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, } else return false; - assert(FI != INT_MAX); + assert(FI != std::numeric_limits<int>::max()); if (!MFI.isFixedObjectIndex(FI)) return false; return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); @@ -2260,7 +2343,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; - ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as @@ -2362,8 +2445,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<CCValAssign, 16> RVLocs; // CCState - Info about the registers and stack slots. - ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, - *DAG.getContext(), Call); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); // Analyze outgoing return values. CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); @@ -2550,7 +2633,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; @@ -2586,12 +2669,35 @@ static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOVi. -static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here SDLoc dl(Op); ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); SDValue Res; + + // When generating execute-only code Constant Pools must be promoted to the + // global data section. It's a bit ugly that we can't share them across basic + // blocks, but this way we guarantee that execute-only behaves correct with + // position-independent addressing modes. + if (Subtarget->genExecuteOnly()) { + auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); + auto T = const_cast<Type*>(CP->getType()); + auto C = const_cast<Constant*>(CP->getConstVal()); + auto M = const_cast<Module*>(DAG.getMachineFunction(). 
+ getFunction()->getParent()); + auto GV = new GlobalVariable( + *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, + Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + + Twine(AFI->createPICLabelUId()) + ); + SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), + dl, PtrVT); + return LowerGlobalAddress(GA, DAG); + } + if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); @@ -2790,9 +2896,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // FIXME: is there useful debug info available here? TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), - DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); + CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( + CallingConv::C, Type::getInt32Ty(*DAG.getContext()), + DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2935,7 +3041,7 @@ static bool isSimpleType(Type *T) { } static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, - EVT PtrVT, SDLoc dl) { + EVT PtrVT, const SDLoc &dl) { // If we're creating a pool entry for a constant global with unnamed address, // and the global is small enough, we can emit it inline into the constant pool // to save ourselves an indirection. @@ -2980,7 +3086,8 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); - if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize) + if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || + Size == 0) return SDValue(); unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); @@ -3034,6 +3141,19 @@ static bool isReadOnly(const GlobalValue *GV) { isa<Function>(GV); } +SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + switch (Subtarget->getTargetTriple().getObjectFormat()) { + default: llvm_unreachable("unknown object format"); + case Triple::COFF: + return LowerGlobalAddressWindows(Op, DAG); + case Triple::ELF: + return LowerGlobalAddressELF(Op, DAG); + case Triple::MachO: + return LowerGlobalAddressDarwin(Op, DAG); + } +} + SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3080,15 +3200,22 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, return Result; } else if (Subtarget->isRWPI() && !IsRO) { // SB-relative. 
- ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue G = DAG.getLoad( - PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + SDValue RelAddr; + if (Subtarget->useMovt(DAG.getMachineFunction())) { + ++NumMovwMovt; + SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); + RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); + } else { // use literal pool for address constant + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + RelAddr = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + } SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); - SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G); + SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); return Result; } @@ -3219,6 +3346,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } return Result; } + case Intrinsic::arm_neon_vabs: + return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), + Op.getOperand(1)); case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) @@ -3256,13 +3386,23 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::arm_neon_vtbl1: + return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::arm_neon_vtbl2: + return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - // FIXME: handle "fence singlethread" more efficiently. SDLoc dl(Op); + ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); + auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); + if (SSID == SyncScope::SingleThread) + return Op; + if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get @@ -3462,8 +3602,8 @@ SDValue ARMTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext(), Prologue); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); SmallVector<SDValue, 16> ArgValues; @@ -3595,7 +3735,6 @@ SDValue ARMTargetLowering::LowerFormalArguments( InVals.push_back(ArgValue); } else { // VA.isRegLoc() - // sanity check assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); @@ -3734,13 +3873,15 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
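The LowerATOMIC_FENCE change above returns single-thread fences unchanged instead of emitting a barrier: a fence that only orders against other code on the same thread (for example a signal handler) needs a compiler barrier, not a DMB. The distinction in C++ terms (illustrative only, not from the patch):

    #include <atomic>

    void fences() {
      // Single-thread scope: compiler-only ordering, no barrier instruction needed.
      std::atomic_signal_fence(std::memory_order_seq_cst);
      // Cross-thread scope: typically lowered to a DMB on ARM.
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }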
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const SDLoc &dl) const { + SelectionDAG &DAG, const SDLoc &dl, + bool InvalidOnQNaN) const { assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); SDValue Cmp; + SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); if (!isFloatingPointZero(RHS)) - Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C); else - Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C); return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } @@ -3757,10 +3898,12 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { Cmp = Cmp.getOperand(0); Opc = Cmp.getOpcode(); if (Opc == ARMISD::CMPFP) - Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), + Cmp.getOperand(1), Cmp.getOperand(2)); else { assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); - Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), + Cmp.getOperand(1)); } return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); } @@ -3808,7 +3951,6 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, return std::make_pair(Value, OverflowCmp); } - SDValue ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { // Let legalize expand this if it isn't a legal type yet. @@ -3832,7 +3974,6 @@ ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } - SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); @@ -4025,7 +4166,6 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, // Additionally, the variable is returned in parameter V and the constant in K. static bool isSaturatingConditional(const SDValue &Op, SDValue &V, uint64_t &K) { - SDValue LHS1 = Op.getOperand(0); SDValue RHS1 = Op.getOperand(1); SDValue TrueVal1 = Op.getOperand(2); @@ -4046,10 +4186,10 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V, // in each conditional SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) ? &RHS1 - : NULL; + : nullptr; SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) ? &RHS2 - : NULL; + : nullptr; SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; @@ -4073,13 +4213,15 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V, const SDValue *LowerCheckOp = isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) ? &Op - : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 - : NULL; + : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) + ? &Op2 + : nullptr; const SDValue *UpperCheckOp = isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) ? &Op - : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 - : NULL; + : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) + ? 
&Op2 + : nullptr; if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) return false; @@ -4104,7 +4246,6 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V, } SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); SDLoc dl(Op); @@ -4162,7 +4303,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - FPCCToARMCC(CC, CondCode, CondCode2); + bool InvalidOnQNaN; + FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); // Try to generate VMAXNM/VMINNM on ARMv8. if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || @@ -4181,13 +4323,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. - SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); + SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; @@ -4348,10 +4490,11 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - FPCCToARMCC(CC, CondCode, CondCode2); + bool InvalidOnQNaN; + FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; @@ -4853,9 +4996,10 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, - DAG.getConstant(Intrinsic::arm_get_fpscr, dl, - MVT::i32)); + SDValue Ops[] = { DAG.getEntryNode(), + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; + + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, @@ -5212,15 +5356,15 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { // Integer comparisons. 
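The FLT_ROUNDS lowering above keeps the same arithmetic but now reads FPSCR through INTRINSIC_W_CHAIN so the read is ordered by the chain. The bit-twiddle maps the FPSCR RMode field onto the FLT_ROUNDS encoding; a standalone model of it (sketch, not from the patch):

    #include <cstdint>

    // FPSCR[23:22] (RMode): 0 = nearest, 1 = +inf, 2 = -inf, 3 = toward zero.
    // FLT_ROUNDS encoding:  1 = nearest, 2 = +inf, 3 = -inf, 0 = toward zero.
    // Adding 1 << 22 before extracting the two bits performs exactly that rotation.
    static uint32_t fltRoundsFromFPSCR(uint32_t fpscr) {
      return ((fpscr + (1u << 22)) >> 22) & 3;
    }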
switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); - case ISD::SETNE: Invert = true; + case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETEQ: Opc = ARMISD::VCEQ; break; - case ISD::SETLT: Swap = true; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = ARMISD::VCGT; break; - case ISD::SETLE: Swap = true; + case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGE: Opc = ARMISD::VCGE; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = ARMISD::VCGTU; break; - case ISD::SETULE: Swap = true; + case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ARMISD::VCGEU; break; } @@ -5584,7 +5728,6 @@ static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { return true; } - static bool isVEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); @@ -5758,7 +5901,10 @@ static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; for (unsigned i = 0; i < M.size(); i += NumElts) { - WhichResult = M[i] == 0 ? 0 : 1; + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; for (unsigned j = 0; j < NumElts; ++j) { if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) return false; @@ -5789,7 +5935,10 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ unsigned Half = NumElts / 2; for (unsigned i = 0; i < M.size(); i += NumElts) { - WhichResult = M[i] == 0 ? 0 : 1; + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; for (unsigned j = 0; j < NumElts; j += Half) { unsigned Idx = WhichResult; for (unsigned k = 0; k < Half; ++k) { @@ -5829,7 +5978,10 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; for (unsigned i = 0; i < M.size(); i += NumElts) { - WhichResult = M[i] == 0 ? 0 : 1; + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; unsigned Idx = WhichResult * NumElts / 2; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || @@ -5862,7 +6014,10 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; for (unsigned i = 0; i < M.size(); i += NumElts) { - WhichResult = M[i] == 0 ? 0 : 1; + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; unsigned Idx = WhichResult * NumElts / 2; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || @@ -6027,10 +6182,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } if (ValueCounts.size() != 1) usesOnlyOneValue = false; - if (!Value.getNode() && ValueCounts.size() > 0) + if (!Value.getNode() && !ValueCounts.empty()) Value = ValueCounts.begin()->first; - if (ValueCounts.size() == 0) + if (ValueCounts.empty()) return DAG.getUNDEF(VT); // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. @@ -6182,8 +6337,8 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, struct ShuffleSourceInfo { SDValue Vec; - unsigned MinElt; - unsigned MaxElt; + unsigned MinElt = std::numeric_limits<unsigned>::max(); + unsigned MaxElt = 0; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. 
As a result @@ -6192,13 +6347,12 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". - int WindowBase; - int WindowScale; + int WindowBase = 0; + int WindowScale = 1; + + ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } - ShuffleSourceInfo(SDValue Vec) - : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), - WindowScale(1) {} }; // First gather all vectors used as an immediate source for this BUILD_VECTOR @@ -6220,7 +6374,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - auto Source = find(Sources, SourceVec); + auto Source = llvm::find(Sources, SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); @@ -6336,7 +6490,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, if (Entry.isUndef()) continue; - auto Src = find(Sources, Entry.getOperand(0)); + auto Src = llvm::find(Sources, Entry.getOperand(0)); int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit @@ -6633,7 +6787,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { EVT SubVT = SubV1.getValueType(); // We expect these to have been canonicalized to -1. - assert(all_of(ShuffleMask, [&](int i) { + assert(llvm::all_of(ShuffleMask, [&](int i) { return i < (int)VT.getVectorNumElements(); }) && "Unexpected shuffle index into UNDEF operand!"); @@ -6896,8 +7050,19 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { N->getValueType(0), N->getOpcode()); - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) - return SkipLoadExtensionForVMULL(LD, DAG); + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && + "Expected extending load"); + + SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); + unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + SDValue extLoad = + DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); + + return newLoad; + } // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will // have been legalized as a BITCAST from v4i32. @@ -7242,7 +7407,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. 
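In the isVUZPMask/isVZIPMask changes earlier in this chunk, a mask that covers both results of the operation (M.size() == 2 * NumElts) now derives the result index from which half of the mask is being scanned rather than from M[i], which can pick the wrong result when the leading element of a half is undef (-1). A simplified, standalone sketch of the corrected VUZP check (not LLVM code):

    #include <vector>

    // Element j of result `WhichResult` of a VUZP must come from position
    // 2*j + WhichResult of the concatenated inputs; undef entries (-1) are ignored.
    static bool isVUZPBothResultsMask(const std::vector<int> &M, unsigned NumElts) {
      if (M.size() != 2 * NumElts)
        return false;
      for (unsigned i = 0; i < M.size(); i += NumElts) {
        unsigned WhichResult = i / NumElts;   // fixed: no longer inferred from M[i]
        for (unsigned j = 0; j < NumElts; ++j)
          if (M[i + j] >= 0 && (unsigned)M[i + j] != 2 * j + WhichResult)
            return false;
      }
      return true;
    }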
- Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; @@ -7258,9 +7423,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { ArgListEntry Entry; Entry.Node = SRet; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isSRet = true; + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsSRet = true; Args.push_back(Entry); RetTy = Type::getVoidTy(*DAG.getContext()); } @@ -7268,8 +7433,8 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); const char *LibcallName = @@ -7427,6 +7592,9 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), dl, MVT::i32); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + if (isBigEndian) + std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); @@ -7454,10 +7622,14 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); - Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, - SDValue(CmpSwap, 0))); - Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, - SDValue(CmpSwap, 0))); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + + Results.push_back( + DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + Results.push_back( + DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } @@ -7480,12 +7652,12 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, Entry.Node = Val; Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isZExt = true; + Entry.IsZExt = true; Args.push_back(Entry); Entry.Node = Exponent; Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isZExt = true; + Entry.IsZExt = true; Args.push_back(Entry); Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); @@ -7517,21 +7689,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); - case ISD::ConstantPool: - if (Subtarget->genExecuteOnly()) - llvm_unreachable("execute-only should not generate constant pools"); - return LowerConstantPool(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::GlobalAddress: - switch (Subtarget->getTargetTriple().getObjectFormat()) { - default: llvm_unreachable("unknown object format"); - case Triple::COFF: - return LowerGlobalAddressWindows(Op, DAG); - case Triple::ELF: - return LowerGlobalAddressELF(Op, DAG); - case Triple::MachO: - return LowerGlobalAddressDarwin(Op, DAG); - } + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -7607,6 +7767,37 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + unsigned Opc = 0; + if (IntNo == Intrinsic::arm_smlald) + Opc = ARMISD::SMLALD; + else if (IntNo == Intrinsic::arm_smlaldx) + Opc = ARMISD::SMLALDX; + else if (IntNo == Intrinsic::arm_smlsld) + Opc = ARMISD::SMLSLD; + else if (IntNo == Intrinsic::arm_smlsldx) + Opc = ARMISD::SMLSLDX; + else + return; + + SDLoc dl(N); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + N->getOperand(3), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + N->getOperand(3), + DAG.getConstant(1, dl, MVT::i32)); + + SDValue LongMul = DAG.getNode(Opc, dl, + DAG.getVTList(MVT::i32, MVT::i32), + N->getOperand(1), N->getOperand(2), + Lo, Hi); + Results.push_back(LongMul.getValue(0)); + Results.push_back(LongMul.getValue(1)); +} + /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. 
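ReplaceLongIntrinsic above splits the 64-bit accumulator of the arm.smlald family into two i32 halves so the two-result machine node can be used. For reference, a scalar model of what SMLALD computes (sketch, not from the patch; the SMLALDX variant swaps the halfwords of the second operand first):

    #include <cstdint>

    // Two signed 16x16 multiplies, both products added to a 64-bit accumulator.
    static int64_t smlald(uint32_t a, uint32_t b, int64_t acc) {
      int64_t lo = int64_t(int16_t(uint16_t(a)))       * int16_t(uint16_t(b));
      int64_t hi = int64_t(int16_t(uint16_t(a >> 16))) * int16_t(uint16_t(b >> 16));
      return acc + lo + hi;
    }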
void ARMTargetLowering::ReplaceNodeResults(SDNode *N, @@ -7648,6 +7839,8 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_64Results(N, Results, DAG); return; + case ISD::INTRINSIC_WO_CHAIN: + return ReplaceLongIntrinsic(N, Results, DAG); } if (Res.getNode()) Results.push_back(Res); @@ -7702,24 +7895,27 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) - .addConstantPoolIndex(CPI) - .addMemOperand(CPMMO)); + BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) + .addConstantPoolIndex(CPI) + .addMemOperand(CPMMO) + .add(predOps(ARMCC::AL)); // Set the low bit because of thumb mode. unsigned NewVReg2 = MRI->createVirtualRegister(TRC); - AddDefaultCC( - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) - .addReg(NewVReg1, RegState::Kill) - .addImm(0x01))); + BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(0x01) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) - .addReg(NewVReg3, RegState::Kill) - .addFrameIndex(FI) - .addImm(36) // &jbuf[1] :: pc - .addMemOperand(FIMMOSt)); + BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) + .addReg(NewVReg3, RegState::Kill) + .addFrameIndex(FI) + .addImm(36) // &jbuf[1] :: pc + .addMemOperand(FIMMOSt) + .add(predOps(ARMCC::AL)); } else if (isThumb) { // Incoming value: jbuf // ldr.n r1, LCPI1_4 @@ -7729,51 +7925,58 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) - .addConstantPoolIndex(CPI) - .addMemOperand(CPMMO)); + BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) + .addConstantPoolIndex(CPI) + .addMemOperand(CPMMO) + .add(predOps(ARMCC::AL)); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. 
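Most of the remaining churn in this hunk and the ones below is mechanical: the AddDefaultPred/AddDefaultCC/AddDefaultT1CC wrappers are replaced by appending the predicate and optional-CPSR operands directly, using the builders from ARMBaseInstrInfo.h. The pattern, side by side (opcode and registers taken from the hunk above for illustration):

    // Before: wrapper functions around the MachineInstrBuilder.
    AddDefaultCC(AddDefaultPred(
        BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
            .addReg(NewVReg1, RegState::Kill)
            .addImm(0x01)));

    // After: an always-true predicate (ARMCC::AL) and the optional cc_out
    // operand are added like any other operand.
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(0x01)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());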
unsigned NewVReg3 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) - .addReg(ARM::CPSR, RegState::Define) - .addImm(1)); + BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) + .addReg(ARM::CPSR, RegState::Define) + .addImm(1) + .add(predOps(ARMCC::AL)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) - .addReg(ARM::CPSR, RegState::Define) - .addReg(NewVReg2, RegState::Kill) - .addReg(NewVReg3, RegState::Kill)); + BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3, RegState::Kill) + .add(predOps(ARMCC::AL)); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) - .addReg(NewVReg4, RegState::Kill) - .addReg(NewVReg5, RegState::Kill) - .addImm(0) - .addMemOperand(FIMMOSt)); + BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) + .addReg(NewVReg4, RegState::Kill) + .addReg(NewVReg5, RegState::Kill) + .addImm(0) + .addMemOperand(FIMMOSt) + .add(predOps(ARMCC::AL)); } else { // Incoming value: jbuf // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) - .addConstantPoolIndex(CPI) - .addImm(0) - .addMemOperand(CPMMO)); + BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) + .addConstantPoolIndex(CPI) + .addImm(0) + .addMemOperand(CPMMO) + .add(predOps(ARMCC::AL)); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) - .addReg(NewVReg1, RegState::Kill) - .addImm(PCLabelId)); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) - .addReg(NewVReg2, RegState::Kill) - .addFrameIndex(FI) - .addImm(36) // &jbuf[1] :: pc - .addMemOperand(FIMMOSt)); + BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(PCLabelId) + .add(predOps(ARMCC::AL)); + BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) + .addReg(NewVReg2, RegState::Kill) + .addFrameIndex(FI) + .addImm(36) // &jbuf[1] :: pc + .addMemOperand(FIMMOSt) + .add(predOps(ARMCC::AL)); } } @@ -7791,7 +7994,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Get a mapping of the call site numbers to all of the landing pads they're // associated with. 
- DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; + DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { @@ -7886,31 +8089,36 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) - .addFrameIndex(FI) - .addImm(4) - .addMemOperand(FIMMOLd)); + BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) + .addFrameIndex(FI) + .addImm(4) + .addMemOperand(FIMMOLd) + .add(predOps(ARMCC::AL)); if (NumLPads < 256) { - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) - .addReg(NewVReg1) - .addImm(LPadList.size())); + BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) + .addReg(NewVReg1) + .addImm(LPadList.size()) + .add(predOps(ARMCC::AL)); } else { unsigned VReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) - .addImm(NumLPads & 0xFFFF)); + BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) + .addImm(NumLPads & 0xFFFF) + .add(predOps(ARMCC::AL)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) - .addReg(VReg1) - .addImm(NumLPads >> 16)); + BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) + .addReg(VReg1) + .addImm(NumLPads >> 16) + .add(predOps(ARMCC::AL)); } - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) - .addReg(NewVReg1) - .addReg(VReg2)); + BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) + .addReg(NewVReg1) + .addReg(VReg2) + .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) @@ -7919,16 +8127,17 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(ARM::CPSR); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) - .addJumpTableIndex(MJTI)); + BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) + .addJumpTableIndex(MJTI) + .add(predOps(ARMCC::AL)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); - AddDefaultCC( - AddDefaultPred( - BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) + BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) - .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) .addReg(NewVReg4, RegState::Kill) @@ -7936,15 +8145,17 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) - .addFrameIndex(FI) - .addImm(1) - .addMemOperand(FIMMOLd)); + BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) + .addFrameIndex(FI) + .addImm(1) + .addMemOperand(FIMMOLd) + .add(predOps(ARMCC::AL)); if (NumLPads < 256) { - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) - .addReg(NewVReg1) - .addImm(NumLPads)); + BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) + .addReg(NewVReg1) + .addImm(NumLPads) + .add(predOps(ARMCC::AL)); } else { MachineConstantPool 
*ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); @@ -7957,12 +8168,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) - .addReg(VReg1, RegState::Define) - .addConstantPoolIndex(Idx)); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) - .addReg(NewVReg1) - .addReg(VReg1)); + BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) + .addReg(VReg1, RegState::Define) + .addConstantPoolIndex(Idx) + .add(predOps(ARMCC::AL)); + BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) + .addReg(NewVReg1) + .addReg(VReg1) + .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) @@ -7971,37 +8184,42 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(ARM::CPSR); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) - .addReg(ARM::CPSR, RegState::Define) - .addReg(NewVReg1) - .addImm(2)); + BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg1) + .addImm(2) + .add(predOps(ARMCC::AL)); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) - .addJumpTableIndex(MJTI)); + BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) + .addJumpTableIndex(MJTI) + .add(predOps(ARMCC::AL)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) - .addReg(ARM::CPSR, RegState::Define) - .addReg(NewVReg2, RegState::Kill) - .addReg(NewVReg3)); + BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3) + .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) - .addReg(NewVReg4, RegState::Kill) - .addImm(0) - .addMemOperand(JTMMOLd)); + BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) + .addReg(NewVReg4, RegState::Kill) + .addImm(0) + .addMemOperand(JTMMOLd) + .add(predOps(ARMCC::AL)); unsigned NewVReg6 = NewVReg5; if (IsPositionIndependent) { NewVReg6 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) - .addReg(ARM::CPSR, RegState::Define) - .addReg(NewVReg5, RegState::Kill) - .addReg(NewVReg3)); + BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg5, RegState::Kill) + .addReg(NewVReg3) + .add(predOps(ARMCC::AL)); } BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) @@ -8009,31 +8227,36 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addJumpTableIndex(MJTI); } else { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) - .addFrameIndex(FI) - .addImm(4) - .addMemOperand(FIMMOLd)); + BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) + .addFrameIndex(FI) + .addImm(4) + .addMemOperand(FIMMOLd) + .add(predOps(ARMCC::AL)); if (NumLPads < 256) { - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) - 
.addReg(NewVReg1) - .addImm(NumLPads)); + BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) + .addReg(NewVReg1) + .addImm(NumLPads) + .add(predOps(ARMCC::AL)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { unsigned VReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) - .addImm(NumLPads & 0xFFFF)); + BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) + .addImm(NumLPads & 0xFFFF) + .add(predOps(ARMCC::AL)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) - .addReg(VReg1) - .addImm(NumLPads >> 16)); + BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) + .addReg(VReg1) + .addImm(NumLPads >> 16) + .add(predOps(ARMCC::AL)); } - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) - .addReg(NewVReg1) - .addReg(VReg2)); + BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) + .addReg(NewVReg1) + .addReg(VReg2) + .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); @@ -8046,13 +8269,15 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) - .addReg(VReg1, RegState::Define) - .addConstantPoolIndex(Idx) - .addImm(0)); - AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) - .addReg(NewVReg1) - .addReg(VReg1, RegState::Kill)); + BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) + .addReg(VReg1, RegState::Define) + .addConstantPoolIndex(Idx) + .addImm(0) + .add(predOps(ARMCC::AL)); + BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) + .addReg(NewVReg1) + .addReg(VReg1, RegState::Kill) + .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) @@ -8061,23 +8286,25 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(ARM::CPSR); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); - AddDefaultCC( - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) - .addReg(NewVReg1) - .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); + BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) + .addReg(NewVReg1) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) - .addJumpTableIndex(MJTI)); + BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) + .addJumpTableIndex(MJTI) + .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); - AddDefaultPred( - BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) - .addReg(NewVReg3, RegState::Kill) - .addReg(NewVReg4) - .addImm(0) - .addMemOperand(JTMMOLd)); + BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) + .addReg(NewVReg3, RegState::Kill) + .addReg(NewVReg4) + .addImm(0) + .addMemOperand(JTMMOLd) + .add(predOps(ARMCC::AL)); if (IsPositionIndependent) { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) @@ -8222,26 +8449,35 @@ static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); assert(LdOpc != 0 && "Should have a load 
opcode"); if (LdSize >= 8) { - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) - .addReg(AddrOut, RegState::Define).addReg(AddrIn) - .addImm(0)); + BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define) + .addReg(AddrIn) + .addImm(0) + .add(predOps(ARMCC::AL)); } else if (IsThumb1) { // load + update AddrIn - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) - .addReg(AddrIn).addImm(0)); - MachineInstrBuilder MIB = - BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); - MIB = AddDefaultT1CC(MIB); - MIB.addReg(AddrIn).addImm(LdSize); - AddDefaultPred(MIB); + BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrIn) + .addImm(0) + .add(predOps(ARMCC::AL)); + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) + .add(t1CondCodeOp()) + .addReg(AddrIn) + .addImm(LdSize) + .add(predOps(ARMCC::AL)); } else if (IsThumb2) { - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) - .addReg(AddrOut, RegState::Define).addReg(AddrIn) - .addImm(LdSize)); + BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define) + .addReg(AddrIn) + .addImm(LdSize) + .add(predOps(ARMCC::AL)); } else { // arm - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) - .addReg(AddrOut, RegState::Define).addReg(AddrIn) - .addReg(0).addImm(LdSize)); + BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define) + .addReg(AddrIn) + .addReg(0) + .addImm(LdSize) + .add(predOps(ARMCC::AL)); } } @@ -8254,24 +8490,36 @@ static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); assert(StOpc != 0 && "Should have a store opcode"); if (StSize >= 8) { - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) - .addReg(AddrIn).addImm(0).addReg(Data)); + BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(AddrIn) + .addImm(0) + .addReg(Data) + .add(predOps(ARMCC::AL)); } else if (IsThumb1) { // store + update AddrIn - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data) - .addReg(AddrIn).addImm(0)); - MachineInstrBuilder MIB = - BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); - MIB = AddDefaultT1CC(MIB); - MIB.addReg(AddrIn).addImm(StSize); - AddDefaultPred(MIB); + BuildMI(*BB, Pos, dl, TII->get(StOpc)) + .addReg(Data) + .addReg(AddrIn) + .addImm(0) + .add(predOps(ARMCC::AL)); + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) + .add(t1CondCodeOp()) + .addReg(AddrIn) + .addImm(StSize) + .add(predOps(ARMCC::AL)); } else if (IsThumb2) { - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) - .addReg(Data).addReg(AddrIn).addImm(StSize)); + BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data) + .addReg(AddrIn) + .addImm(StSize) + .add(predOps(ARMCC::AL)); } else { // arm - AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) - .addReg(Data).addReg(AddrIn).addReg(0) - .addImm(StSize)); + BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data) + .addReg(AddrIn) + .addReg(0) + .addImm(StSize) + .add(predOps(ARMCC::AL)); } } @@ -8402,16 +8650,15 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); - AddDefaultPred(BuildMI(BB, dl, - TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), - Vtmp).addImm(LoopSize & 0xFFFF)); + BuildMI(BB, dl, TII->get(IsThumb ? 
ARM::t2MOVi16 : ARM::MOVi16), Vtmp) + .addImm(LoopSize & 0xFFFF) + .add(predOps(ARMCC::AL)); if ((LoopSize & 0xFFFF0000) != 0) - AddDefaultPred(BuildMI(BB, dl, - TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), - varEnd) - .addReg(Vtmp) - .addImm(LoopSize >> 16)); + BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) + .addReg(Vtmp) + .addImm(LoopSize >> 16) + .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); @@ -8424,11 +8671,16 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb) - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg( - varEnd, RegState::Define).addConstantPoolIndex(Idx)); + BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) + .addReg(varEnd, RegState::Define) + .addConstantPoolIndex(Idx) + .add(predOps(ARMCC::AL)); else - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg( - varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0)); + BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) + .addReg(varEnd, RegState::Define) + .addConstantPoolIndex(Idx) + .addImm(0) + .add(predOps(ARMCC::AL)); } BB->addSuccessor(loopMBB); @@ -8465,16 +8717,19 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // Decrement loop variable by UnitSize. if (IsThumb1) { - MachineInstrBuilder MIB = - BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop); - MIB = AddDefaultT1CC(MIB); - MIB.addReg(varPhi).addImm(UnitSize); - AddDefaultPred(MIB); + BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) + .add(t1CondCodeOp()) + .addReg(varPhi) + .addImm(UnitSize) + .add(predOps(ARMCC::AL)); } else { MachineInstrBuilder MIB = BuildMI(*BB, BB->end(), dl, TII->get(IsThumb2 ? 
ARM::t2SUBri : ARM::SUBri), varLoop); - AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); + MIB.addReg(varPhi) + .addImm(UnitSize) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); MIB->getOperand(5).setReg(ARM::CPSR); MIB->getOperand(5).setIsDef(true); } @@ -8545,11 +8800,14 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, case CodeModel::Default: case CodeModel::Kernel: BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) - .addImm((unsigned)ARMCC::AL).addReg(0) - .addExternalSymbol("__chkstk") - .addReg(ARM::R4, RegState::Implicit | RegState::Kill) - .addReg(ARM::R4, RegState::Implicit | RegState::Define) - .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + .add(predOps(ARMCC::AL)) + .addExternalSymbol("__chkstk") + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, + RegState::Implicit | RegState::Define | RegState::Dead) + .addReg(ARM::CPSR, + RegState::Implicit | RegState::Define | RegState::Dead); break; case CodeModel::Large: case CodeModel::JITDefault: { @@ -8559,20 +8817,24 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) - .addImm((unsigned)ARMCC::AL).addReg(0) - .addReg(Reg, RegState::Kill) - .addReg(ARM::R4, RegState::Implicit | RegState::Kill) - .addReg(ARM::R4, RegState::Implicit | RegState::Define) - .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + .add(predOps(ARMCC::AL)) + .addReg(Reg, RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, + RegState::Implicit | RegState::Define | RegState::Dead) + .addReg(ARM::CPSR, + RegState::Implicit | RegState::Define | RegState::Dead); break; } } - AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), - ARM::SP) - .addReg(ARM::SP, RegState::Kill) - .addReg(ARM::R4, RegState::Kill) - .setMIFlags(MachineInstr::FrameSetup))); + BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) + .addReg(ARM::SP, RegState::Kill) + .addReg(ARM::R4, RegState::Kill) + .setMIFlags(MachineInstr::FrameSetup) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); MI.eraseFromParent(); return MBB; @@ -8597,9 +8859,10 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, MF->push_back(TrapBB); MBB->addSuccessor(TrapBB); - AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) - .addReg(MI.getOperand(0).getReg()) - .addImm(0)); + BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) + .addReg(MI.getOperand(0).getReg()) + .addImm(0) + .add(predOps(ARMCC::AL)); BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) .addMBB(TrapBB) .addImm(ARMCC::EQ) @@ -8617,18 +8880,18 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool isThumb2 = Subtarget->isThumb2(); switch (MI.getOpcode()) { default: { - MI.dump(); + MI.print(errs()); llvm_unreachable("Unexpected instr type to insert"); } // Thumb1 post-indexed loads are really just single-register LDMs. 
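Editorial note on the hunks above: they all apply the same mechanical rewrite, dropping the old AddDefaultPred/AddDefaultCC/AddDefaultT1CC wrappers in favour of appending the optional predicate and cc_out operands explicitly through MachineInstrBuilder::add. A minimal sketch of the equivalence, using an illustrative helper name (emitExampleMov) and assuming it is compiled inside the ARM backend where ARMBaseInstrInfo.h (predOps, condCodeOp, t1CondCodeOp) and the ARM::* opcode enum are visible:

    #include "ARMBaseInstrInfo.h"                  // predOps(), condCodeOp()
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/IR/DebugLoc.h"

    using namespace llvm;

    // Illustrative helper, not part of the patch: emit "MOVsi Dst, Src, #Imm"
    // at the end of MBB with the always-execute predicate and no CPSR def.
    static void emitExampleMov(MachineBasicBlock &MBB, const DebugLoc &dl,
                               const TargetInstrInfo *TII, unsigned Dst,
                               unsigned Src, unsigned Imm) {
      // Pre-patch spelling of the same instruction:
      //   AddDefaultCC(AddDefaultPred(BuildMI(&MBB, dl, TII->get(ARM::MOVsi), Dst)
      //                                   .addReg(Src).addImm(Imm)));
      BuildMI(&MBB, dl, TII->get(ARM::MOVsi), Dst)
          .addReg(Src)
          .addImm(Imm)
          .add(predOps(ARMCC::AL))  // predicate operands: ARMCC::AL + no reg
          .add(condCodeOp());       // optional cc_out operand: no CPSR def
    }

On Thumb1 opcodes such as tADDi8/tSUBi8 the CPSR-defining operand sits right after the destination, which is why those hunks add t1CondCodeOp() before the source operands rather than a trailing condCodeOp().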
case ARM::tLDR_postidx: { BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) - .addOperand(MI.getOperand(1)) // Rn_wb - .addOperand(MI.getOperand(2)) // Rn - .addOperand(MI.getOperand(3)) // PredImm - .addOperand(MI.getOperand(4)) // PredReg - .addOperand(MI.getOperand(0)); // Rt + .add(MI.getOperand(1)) // Rn_wb + .add(MI.getOperand(2)) // Rn + .add(MI.getOperand(3)) // PredImm + .add(MI.getOperand(4)) // PredReg + .add(MI.getOperand(0)); // Rt MI.eraseFromParent(); return BB; } @@ -8659,12 +8922,12 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineMemOperand *MMO = *MI.memoperands_begin(); BuildMI(*BB, MI, dl, TII->get(NewOpc)) - .addOperand(MI.getOperand(0)) // Rn_wb - .addOperand(MI.getOperand(1)) // Rt - .addOperand(MI.getOperand(2)) // Rn - .addImm(Offset) // offset (skip GPR==zero_reg) - .addOperand(MI.getOperand(5)) // pred - .addOperand(MI.getOperand(6)) + .add(MI.getOperand(0)) // Rn_wb + .add(MI.getOperand(1)) // Rt + .add(MI.getOperand(2)) // Rn + .addImm(Offset) // offset (skip GPR==zero_reg) + .add(MI.getOperand(5)) // pred + .add(MI.getOperand(6)) .addMemOperand(MMO); MI.eraseFromParent(); return BB; @@ -8681,7 +8944,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); for (unsigned i = 0; i < MI.getNumOperands(); ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MI.eraseFromParent(); return BB; } @@ -8754,18 +9017,20 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, unsigned LHS1 = MI.getOperand(1).getReg(); unsigned LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { - AddDefaultPred(BuildMI(BB, dl, - TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) - .addReg(LHS1).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS1) + .addImm(0) + .add(predOps(ARMCC::AL)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { unsigned RHS1 = MI.getOperand(3).getReg(); unsigned RHS2 = MI.getOperand(4).getReg(); - AddDefaultPred(BuildMI(BB, dl, - TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) - .addReg(LHS1).addReg(RHS1)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS1) + .addReg(RHS1) + .add(predOps(ARMCC::AL)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS2).addReg(RHS2) .addImm(ARMCC::EQ).addReg(ARM::CPSR); @@ -8779,7 +9044,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); if (isThumb2) - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); + BuildMI(BB, dl, TII->get(ARM::t2B)) + .addMBB(exitMBB) + .add(predOps(ARMCC::AL)); else BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); @@ -8842,9 +9109,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, RSBBB->addSuccessor(SinkBB); // insert a cmp at the end of BB - AddDefaultPred(BuildMI(BB, dl, - TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) - .addReg(ABSSrcReg).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPri : ARM::CMPri)) + .addReg(ABSSrcReg) + .addImm(0) + .add(predOps(ARMCC::AL)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, @@ -8855,9 +9123,11 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Note: BCC and rsbri will be converted into predicated rsbmi // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, - TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) - .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) - .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) + .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); // insert PHI in SinkBB, // reuse ABSDstReg to not change uses of ABS instruction @@ -8927,19 +9197,45 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Rename pseudo opcodes. unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); + unsigned ccOutIdx; if (NewOpc) { const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); - assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 && - "converted opcode should be the same except for cc_out"); + assert(MCID->getNumOperands() == + MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() + && "converted opcode should be the same except for cc_out" + " (and, on Thumb1, pred)"); MI.setDesc(*MCID); // Add the optional cc_out operand MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); - } - unsigned ccOutIdx = MCID->getNumOperands() - 1; + + // On Thumb1, move all input operands to the end, then add the predicate + if (Subtarget->isThumb1Only()) { + for (unsigned c = MCID->getNumOperands() - 4; c--;) { + MI.addOperand(MI.getOperand(1)); + MI.RemoveOperand(1); + } + + // Restore the ties + for (unsigned i = MI.getNumOperands(); i--;) { + const MachineOperand& op = MI.getOperand(i); + if (op.isReg() && op.isUse()) { + int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); + if (DefIdx != -1) + MI.tieOperands(DefIdx, i); + } + } + + MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); + MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); + ccOutIdx = 1; + } else + ccOutIdx = MCID->getNumOperands() - 1; + } else + ccOutIdx = MCID->getNumOperands() - 1; // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. @@ -8970,7 +9266,9 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (deadCPSR) { assert(!MI.getOperand(ccOutIdx).getReg() && "expect uninitialized optional cc_out operand"); - return; + // Thumb1 instructions must have the S bit even if the CPSR is dead. + if (!Subtarget->isThumb1Only()) + return; } // If this instruction was defined with an optional CPSR def and its dag node @@ -9032,7 +9330,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDLoc dl(N); EVT VT = N->getValueType(0); CC = N->getOperand(0); - if (CC.getValueType() != MVT::i1) + if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) return false; Invert = !AllOnes; if (AllOnes) @@ -9265,8 +9563,11 @@ AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } - // Don't generate vpaddl+vmovn; we'll match it to vpadd later. - if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. 
Also make sure + // we're using the entire input vector, otherwise there's a size/legality + // mismatch somewhere. + if (nextIndex != Vec.getValueType().getVectorNumElements() || + Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) return SDValue(); // Create VPADDL node. @@ -9308,10 +9609,90 @@ static SDValue findMUL_LOHI(SDValue V) { return SDValue(); } -static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, +static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + if (Subtarget->isThumb()) { + if (!Subtarget->hasDSP()) + return SDValue(); + } else if (!Subtarget->hasV5TEOps()) + return SDValue(); + + // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and + // accumulates the product into a 64-bit value. The 16-bit values will + // be sign extended somehow or SRA'd into 32-bit values + // (addc (adde (mul 16bit, 16bit), lo), hi) + SDValue Mul = AddcNode->getOperand(0); + SDValue Lo = AddcNode->getOperand(1); + if (Mul.getOpcode() != ISD::MUL) { + Lo = AddcNode->getOperand(0); + Mul = AddcNode->getOperand(1); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + } + + SDValue SRA = AddeNode->getOperand(0); + SDValue Hi = AddeNode->getOperand(1); + if (SRA.getOpcode() != ISD::SRA) { + SRA = AddeNode->getOperand(1); + Hi = AddeNode->getOperand(0); + if (SRA.getOpcode() != ISD::SRA) + return SDValue(); + } + if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { + if (Const->getZExtValue() != 31) + return SDValue(); + } else + return SDValue(); + + if (SRA.getOperand(0) != Mul) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(AddcNode); + unsigned Opcode = 0; + SDValue Op0; + SDValue Op1; + + if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { + Opcode = ARMISD::SMLALBB; + Op0 = Mul.getOperand(0); + Op1 = Mul.getOperand(1); + } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { + Opcode = ARMISD::SMLALBT; + Op0 = Mul.getOperand(0); + Op1 = Mul.getOperand(1).getOperand(0); + } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { + Opcode = ARMISD::SMLALTB; + Op0 = Mul.getOperand(0).getOperand(0); + Op1 = Mul.getOperand(1); + } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { + Opcode = ARMISD::SMLALTT; + Op0 = Mul->getOperand(0).getOperand(0); + Op1 = Mul->getOperand(1).getOperand(0); + } + + if (!Op0 || !Op1) + return SDValue(); + + SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), + Op0, Op1, Lo, Hi); + // Replace the ADDs' nodes uses by the MLA node's values. + SDValue HiMLALResult(SMLAL.getNode(), 1); + SDValue LoMLALResult(SMLAL.getNode(), 0); + + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + +static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - // Look for multiply add opportunities. 
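For reference on the new ARMISD nodes introduced by the SMLAL16 combine above: SMLALBB/SMLALBT/SMLALTB/SMLALTT are the halfword multiply-accumulate-long forms, where B/T select the bottom or top halfword of each source. A scalar sketch of the arithmetic, based on the documented ARM semantics rather than code from this patch (function name is illustrative):

    #include <cstdint>

    // SMLALBB: RdHi:RdLo += sext(Rn[15:0]) * sext(Rm[15:0]).
    static int64_t smlalbb(int64_t acc, int32_t n, int32_t m) {
      return acc + (int16_t)n * (int16_t)m;
    }
    // SMLALBT/SMLALTB/SMLALTT differ only in taking the top halfword
    // ((int16_t)(x >> 16)) of one or both operands, which is why the combine
    // above distinguishes the isS16 and isSRA16 shapes of the multiplicands.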
// The pattern is a ISD::UMUL_LOHI followed by two add nodes, where // each add nodes consumes a value from ISD::UMUL_LOHI and there is @@ -9326,7 +9707,17 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, // \ / // ADDC <- hiAdd // - assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); + assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE"); + + assert(AddeNode->getNumOperands() == 3 && + AddeNode->getOperand(2).getValueType() == MVT::i32 && + "ADDE node has the wrong inputs"); + + // Check that we have a glued ADDC node. + SDNode* AddcNode = AddeNode->getOperand(2).getNode(); + if (AddcNode->getOpcode() != ARMISD::ADDC) + return SDValue(); + SDValue AddcOp0 = AddcNode->getOperand(0); SDValue AddcOp1 = AddcNode->getOperand(1); @@ -9338,29 +9729,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, AddcNode->getValueType(0) == MVT::i32 && "Expect ADDC with two result values. First: i32"); - // Check that we have a glued ADDC node. - if (AddcNode->getValueType(1) != MVT::Glue) - return SDValue(); - - // Check that the ADDC adds the low result of the S/UMUL_LOHI. + // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it + // maybe a SMLAL which multiplies two 16-bit values. if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && AddcOp0->getOpcode() != ISD::SMUL_LOHI && AddcOp1->getOpcode() != ISD::UMUL_LOHI && AddcOp1->getOpcode() != ISD::SMUL_LOHI) - return SDValue(); - - // Look for the glued ADDE. - SDNode* AddeNode = AddcNode->getGluedUser(); - if (!AddeNode) - return SDValue(); - - // Make sure it is really an ADDE. - if (AddeNode->getOpcode() != ISD::ADDE) - return SDValue(); - - assert(AddeNode->getNumOperands() == 3 && - AddeNode->getOperand(2).getValueType() == MVT::Glue && - "ADDE node has the wrong inputs"); + return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget); // Check for the triangle shape. SDValue AddeOp0 = AddeNode->getOperand(0); @@ -9435,38 +9810,25 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); // Return original node to notify the driver to stop replacing. - SDValue resNode(AddcNode, 0); - return resNode; + return SDValue(AddeNode, 0); } -static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode, +static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // UMAAL is similar to UMLAL except that it adds two unsigned values. // While trying to combine for the other MLAL nodes, first search for the - // chance to use UMAAL. Check if Addc uses another addc node which can first - // be combined into a UMLAL. The other pattern is AddcNode being combined - // into an UMLAL and then using another addc is handled in ISelDAGToDAG. - - if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || - (Subtarget->isThumb() && !Subtarget->hasThumb2())) - return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); - - SDNode *PrevAddc = nullptr; - if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC) - PrevAddc = AddcNode->getOperand(0).getNode(); - else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC) - PrevAddc = AddcNode->getOperand(1).getNode(); - - // If there's no addc chains, just return a search for any MLAL. - if (PrevAddc == nullptr) - return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); - - // Try to convert the addc operand to an MLAL and if that fails try to - // combine AddcNode. 
- SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget); - if (MLAL != SDValue(PrevAddc, 0)) - return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + // chance to use UMAAL. Check if Addc uses a node which has already + // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde + // as the addend, and it's handled in PerformUMLALCombine. + + if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) + return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); + + // Check that we have a glued ADDC node. + SDNode* AddcNode = AddeNode->getOperand(2).getNode(); + if (AddcNode->getOpcode() != ARMISD::ADDC) + return SDValue(); // Find the converted UMAAL or quit if it doesn't exist. SDNode *UmlalNode = nullptr; @@ -9478,29 +9840,18 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode, UmlalNode = AddcNode->getOperand(1).getNode(); AddHi = AddcNode->getOperand(0); } else { - return SDValue(); + return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); } // The ADDC should be glued to an ADDE node, which uses the same UMLAL as // the ADDC as well as Zero. - auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3)); - - if (!Zero || Zero->getZExtValue() != 0) - return SDValue(); - - // Check that we have a glued ADDC node. - if (AddcNode->getValueType(1) != MVT::Glue) - return SDValue(); - - // Look for the glued ADDE. - SDNode* AddeNode = AddcNode->getGluedUser(); - if (!AddeNode) + if (!isNullConstant(UmlalNode->getOperand(3))) return SDValue(); - if ((AddeNode->getOperand(0).getNode() == Zero && + if ((isNullConstant(AddeNode->getOperand(0)) && AddeNode->getOperand(1).getNode() == UmlalNode) || (AddeNode->getOperand(0).getNode() == UmlalNode && - AddeNode->getOperand(1).getNode() == Zero)) { + isNullConstant(AddeNode->getOperand(1)))) { SelectionDAG &DAG = DCI.DAG; SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), @@ -9513,19 +9864,84 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode, DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); // Return original node to notify the driver to stop replacing. - return SDValue(AddcNode, 0); + return SDValue(AddeNode, 0); } return SDValue(); } -/// PerformADDCCombine - Target-specific dag combine transform from -/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or -/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL -static SDValue PerformADDCCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { +static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) + return SDValue(); + + // Check that we have a pair of ADDC and ADDE as operands. + // Both addends of the ADDE must be zero. 
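The target of the UMAAL combines is ARM's unsigned multiply-accumulate-accumulate-long instruction. A scalar sketch of what it computes, from the documented ARM semantics rather than this patch (function name is illustrative); it is exactly the UMLAL-plus-zero ADDC/ADDE shape checked for below:

    #include <cstdint>

    // UMAAL RdLo, RdHi, Rn, Rm: RdHi:RdLo = Rn * Rm + RdLo + RdHi.
    // The sum cannot overflow 64 bits: (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1.
    static uint64_t umaal(uint32_t RdLo, uint32_t RdHi, uint32_t Rn, uint32_t Rm) {
      return (uint64_t)Rn * Rm + RdLo + RdHi;
    }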
+ SDNode* AddcNode = N->getOperand(2).getNode(); + SDNode* AddeNode = N->getOperand(3).getNode(); + if ((AddcNode->getOpcode() == ARMISD::ADDC) && + (AddeNode->getOpcode() == ARMISD::ADDE) && + isNullConstant(AddeNode->getOperand(0)) && + isNullConstant(AddeNode->getOperand(1)) && + (AddeNode->getOperand(2).getNode() == AddcNode)) + return DAG.getNode(ARMISD::UMAAL, SDLoc(N), + DAG.getVTList(MVT::i32, MVT::i32), + {N->getOperand(0), N->getOperand(1), + AddcNode->getOperand(0), AddcNode->getOperand(1)}); + else + return SDValue(); +} + +static SDValue PerformAddcSubcCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if (Subtarget->isThumb1Only()) { + SDValue RHS = N->getOperand(1); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { + int32_t imm = C->getSExtValue(); + if (imm < 0 && imm > INT_MIN) { + SDLoc DL(N); + RHS = DAG.getConstant(-imm, DL, MVT::i32); + unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC + : ARMISD::ADDC; + return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); + } + } + } + return SDValue(); +} - if (Subtarget->isThumb1Only()) return SDValue(); +static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if (Subtarget->isThumb1Only()) { + SDValue RHS = N->getOperand(1); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { + int64_t imm = C->getSExtValue(); + if (imm < 0) { + SDLoc DL(N); + + // The with-carry-in form matches bitwise not instead of the negation. + // Effectively, the inverse interpretation of the carry flag already + // accounts for part of the negation. + RHS = DAG.getConstant(~imm, DL, MVT::i32); + + unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE + : ARMISD::ADDE; + return DAG.getNode(Opcode, DL, N->getVTList(), + N->getOperand(0), RHS, N->getOperand(2)); + } + } + } + return SDValue(); +} + +/// PerformADDECombine - Target-specific dag combine transform from +/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or +/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL +static SDValue PerformADDECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Only ARM and Thumb2 support UMLAL/SMLAL. + if (Subtarget->isThumb1Only()) + return PerformAddeSubeCombine(N, DCI.DAG, Subtarget); // Only perform the checks after legalize when the pattern is available. if (DCI.isBeforeLegalize()) return SDValue(); @@ -9722,7 +10138,6 @@ static SDValue PerformMULCombine(SDNode *N, static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - // Attempt to use immediate-form VBIC BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); SDLoc dl(N); @@ -9761,6 +10176,67 @@ static SDValue PerformANDCombine(SDNode *N, return SDValue(); } +// Try combining OR nodes to SMULWB, SMULWT. +static SDValue PerformORCombineToSMULWBT(SDNode *OR, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasV6Ops() || + (Subtarget->isThumb() && + (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) + return SDValue(); + + SDValue SRL = OR->getOperand(0); + SDValue SHL = OR->getOperand(1); + + if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { + SRL = OR->getOperand(1); + SHL = OR->getOperand(0); + } + if (!isSRL16(SRL) || !isSHL16(SHL)) + return SDValue(); + + // The first operands to the shifts need to be the two results from the + // same smul_lohi node. 
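To make the OR-to-SMULW rewrite being built here concrete: (or (srl lo, 16), (shl hi, 16)) over the two smul_lohi results selects bits [47:16] of the full product, which is what SMULWB/SMULWT produce directly when one multiplicand is a sign-extended halfword. A scalar sketch, based on the documented ARM semantics rather than code from this patch (function names are illustrative):

    #include <cstdint>

    // SMULWB: 32 x bottom-16 signed multiply, keeping the top 32 bits of the
    // 48-bit product, i.e. bits [47:16].
    static int32_t smulwb(int32_t a, int32_t b) {
      return (int32_t)(((int64_t)a * (int16_t)b) >> 16);
    }

    // SMULWT: the same with the top halfword of b.
    static int32_t smulwt(int32_t a, int32_t b) {
      return (int32_t)(((int64_t)a * (int16_t)(b >> 16)) >> 16);
    }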
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || + SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) + return SDValue(); + + SDNode *SMULLOHI = SRL.getOperand(0).getNode(); + if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || + SHL.getOperand(0) != SDValue(SMULLOHI, 1)) + return SDValue(); + + // Now we have: + // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) + // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. + // For SMUWB the 16-bit value will signed extended somehow. + // For SMULWT only the SRA is required. + // Check both sides of SMUL_LOHI + SDValue OpS16 = SMULLOHI->getOperand(0); + SDValue OpS32 = SMULLOHI->getOperand(1); + + SelectionDAG &DAG = DCI.DAG; + if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { + OpS16 = OpS32; + OpS32 = SMULLOHI->getOperand(0); + } + + SDLoc dl(OR); + unsigned Opcode = 0; + if (isS16(OpS16, DAG)) + Opcode = ARMISD::SMULWB; + else if (isSRA16(OpS16)) { + Opcode = ARMISD::SMULWT; + OpS16 = OpS16->getOperand(0); + } + else + return SDValue(); + + SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); + DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); + return SDValue(OR, 0); +} + /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -9798,6 +10274,8 @@ static SDValue PerformORCombine(SDNode *N, // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) return Result; + if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) + return Result; } // The code below optimizes (or (and X, Y), Z). @@ -9906,7 +10384,7 @@ static SDValue PerformORCombine(SDNode *N, (Mask == ~Mask2)) { // The pack halfword instruction works better for masks that fit it, // so use that when it's available. - if (Subtarget->hasT2ExtractPack() && + if (Subtarget->hasDSP() && (Mask == 0xffff || Mask == 0xffff0000)) return SDValue(); // 2a @@ -9922,7 +10400,7 @@ static SDValue PerformORCombine(SDNode *N, (~Mask == Mask2)) { // The pack halfword instruction works better for masks that fit it, // so use that when it's available. - if (Subtarget->hasT2ExtractPack() && + if (Subtarget->hasDSP() && (Mask2 == 0xffff || Mask2 == 0xffff0000)) return SDValue(); // 2b @@ -10485,11 +10963,8 @@ static SDValue CombineBaseUpdate(SDNode *N, // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { - uint64_t IncVal = CInc->getZExtValue(); - if (IncVal != NumBytes) - continue; - } else if (NumBytes >= 3 * 16) { + ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); + if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two // separate instructions that make it harder to use a non-constant update. continue; @@ -11306,34 +11781,6 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, - APInt &KnownOne) { - if (Op.getOpcode() == ARMISD::BFI) { - // Conservatively, we can recurse down the first operand - // and just mask out all affected bits. - computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); - - // The operand to BFI is already a mask suitable for removing the bits it - // sets. 
- ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); - const APInt &Mask = CI->getAPIntValue(); - KnownZero &= Mask; - KnownOne &= Mask; - return; - } - if (Op.getOpcode() == ARMISD::CMOV) { - APInt KZ2(KnownZero.getBitWidth(), 0); - APInt KO2(KnownOne.getBitWidth(), 0); - computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); - computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); - - KnownZero &= KZ2; - KnownOne &= KO2; - return; - } - return DAG.computeKnownBits(Op, KnownZero, KnownOne); -} - SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { // If we have a CMOV, OR and AND combination such as: // if (x & CN) @@ -11394,9 +11841,9 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D // Lastly, can we determine that the bits defined by OrCI // are zero in Y? - APInt KnownZero, KnownOne; - computeKnownBits(DAG, Y, KnownZero, KnownOne); - if ((OrCI & KnownZero) != OrCI) + KnownBits Known; + DAG.computeKnownBits(Y, Known); + if ((OrCI & Known.Zero) != OrCI) return SDValue(); // OK, we can do the combine. @@ -11534,16 +11981,16 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { } if (Res.getNode()) { - APInt KnownZero, KnownOne; - DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); + KnownBits Known; + DAG.computeKnownBits(SDValue(N,0), Known); // Capture demanded bits information that would be otherwise lost. - if (KnownZero == 0xfffffffe) + if (Known.Zero == 0xfffffffe) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i1)); - else if (KnownZero == 0xffffff00) + else if (Known.Zero == 0xffffff00) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i8)); - else if (KnownZero == 0xffff0000) + else if (Known.Zero == 0xffff0000) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i16)); } @@ -11555,13 +12002,17 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; - case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); + case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); + case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ARMISD::ADDC: + case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI.DAG, Subtarget); + case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget); case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); @@ -11593,6 +12044,56 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); + case ARMISD::SMULWB: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); + if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) + return SDValue(); + break; + } + case ARMISD::SMULWT: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); + if 
(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) + return SDValue(); + break; + } + case ARMISD::SMLALBB: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALBT: { + unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); + unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); + APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALTB: { + unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); + APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); + unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALTT: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -11688,12 +12189,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, } } - // Lowering to i32/i16 if the size permits. - if (Size >= 4) - return MVT::i32; - else if (Size >= 2) - return MVT::i16; - // Let the target-independent logic figure it out. return MVT::Other; } @@ -12178,12 +12673,12 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, } void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - unsigned BitWidth = KnownOne.getBitWidth(); - KnownZero = KnownOne = APInt(BitWidth, 0); + unsigned BitWidth = Known.getBitWidth(); + Known.resetAll(); switch (Op.getOpcode()) { default: break; case ARMISD::ADDC: @@ -12193,17 +12688,18 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // These nodes' second result is a boolean if (Op.getResNo() == 0) break; - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. 
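A tiny worked example of that intersection, using made-up 8-bit values rather than anything from this function (Known8/intersect are illustrative stand-ins, not LLVM types):

    #include <cstdint>

    // Stand-in for llvm::KnownBits at width 8: Zero marks bits known to be 0,
    // One marks bits known to be 1; a bit set in neither is unknown.
    struct Known8 { uint8_t Zero, One; };

    // The result of a CMOV/select is known only where both inputs agree.
    static Known8 intersect(Known8 L, Known8 R) {
      return {uint8_t(L.Zero & R.Zero), uint8_t(L.One & R.One)};
    }
    // L = 0b0000'1??1 (Zero 0xF0, One 0x09), R = 0b00??'1?01 (Zero 0xC2, One 0x09)
    // => intersect(L, R) = {0xC0, 0x09}: bits 7-6 stay known-zero, bits 3 and 0
    //    stay known-one, everything else becomes unknown.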
- DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - if (KnownZero == 0 && KnownOne == 0) return; + DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1); + if (Known.isUnknown()) + return; - APInt KnownZeroRHS, KnownOneRHS; - DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); - KnownZero &= KnownZeroRHS; - KnownOne &= KnownOneRHS; + KnownBits KnownRHS; + DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1); + Known.Zero &= KnownRHS.Zero; + Known.One &= KnownRHS.One; return; } case ISD::INTRINSIC_W_CHAIN: { @@ -12215,11 +12711,24 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::arm_ldrex: { EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } } + case ARMISD::BFI: { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); + const APInt &Mask = CI->getAPIntValue(); + Known.Zero &= Mask; + Known.One &= Mask; + return; + } } } @@ -12588,8 +13097,8 @@ static TargetLowering::ArgListTy getDivRemArgList( Type *ArgTy = ArgVT.getTypeForEVT(*Context); Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } if (Subtarget->isTargetWindows() && Args.size() >= 2) @@ -12615,7 +13124,9 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { // rem = a - b * div // return {div, rem} // This should be lowered into UDIV/SDIV + MLS later on. - if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() && + bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() + : Subtarget->hasDivideInARMMode(); + if (hasDivide && Op->getValueType(0).isSimple() && Op->getSimpleValueType(0) == MVT::i32) { unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; const SDValue Dividend = Op->getOperand(0); @@ -12639,7 +13150,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); + Type *RetTy = StructType::get(Ty, Ty); if (Subtarget->isTargetWindows()) InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); @@ -12861,7 +13372,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } case Intrinsic::arm_stlexd: - case Intrinsic::arm_strexd: { + case Intrinsic::arm_strexd: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(2); @@ -12871,9 +13382,9 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = false; Info.writeMem = true; return true; - } + case Intrinsic::arm_ldaexd: - case Intrinsic::arm_ldrexd: { + case Intrinsic::arm_ldrexd: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); @@ -12883,7 +13394,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = true; Info.writeMem = false; return true; - } + default: break; } @@ -12921,7 +13432,7 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { - Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); + Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; @@ -12932,7 +13443,7 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { - Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? 
ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); @@ -12941,9 +13452,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -12952,7 +13463,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::SequentiallyConsistent: - if (!IsStore) + if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do /*FALLTHROUGH*/ case AtomicOrdering::Release: @@ -12966,9 +13477,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } -Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -13089,7 +13600,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, if (ValTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; - Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); + Function *Ldrex = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); @@ -13106,7 +13617,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; - Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldrex, Addr), @@ -13118,7 +13629,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); + Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); } Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, @@ -13154,6 +13665,39 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Addr}); } +/// A helper function for determining the number of interleaved accesses we +/// will generate when lowering accesses of the given type. +unsigned +ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const { + return (DL.getTypeSizeInBits(VecTy) + 127) / 128; +} + +bool ARMTargetLowering::isLegalInterleavedAccessType( + VectorType *VecTy, const DataLayout &DL) const { + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + + // Ensure the vector doesn't have f16 elements. Even though we could do an + // i16 vldN, we can't hold the f16 vectors and will end up converting via + // f32. 
+ if (VecTy->getElementType()->isHalfTy()) + return false; + + // Ensure the number of vector elements is greater than 1. + if (VecTy->getNumElements() < 2) + return false; + + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32) + return false; + + // Ensure the total vector size is 64 or a multiple of 128. Types larger than + // 128 will be split into multiple interleaved accesses. + return VecSize == 64 || VecSize % 128 == 0; +} + /// \brief Lower an interleaved load into a vldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -13178,64 +13722,99 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - unsigned VecSize = DL.getTypeSizeInBits(VecTy); - bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip if we do not have NEON and skip illegal vector types and vector types - // with i64/f64 elements (vldN doesn't support i64/f64 elements). - if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; + unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. if (EltTy->isPointerTy()) VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + IRBuilder<> Builder(LI); + + // The base address of the load. + Value *BaseAddr = LI->getPointerOperand(); + + if (NumLoads > 1) { + // If we're going to generate more than one load, reset the sub-vector type + // to something legal. + VecTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / NumLoads); + + // We will compute the pointer operand of each load from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. + BaseAddr = Builder.CreateBitCast( + BaseAddr, VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace())); + } + + assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); + + Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); + Type *Tys[] = {VecTy, Int8Ptr}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); - IRBuilder<> Builder(LI); - SmallVector<Value *, 2> Ops; + // Holds sub-vectors extracted from the load intrinsic return values. The + // sub-vectors are associated with the shufflevector instructions they will + // replace. 
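The splitting scheme driving this lowering is the one set up by the two new helpers above: a group is legal if its total bit size is 64 or a multiple of 128, and a wide group is broken into ceil(bits/128) vldN calls. A standalone restatement of that size arithmetic for reference (illustrative function names, restated on raw bit sizes only; the real helper also checks element type and count):

    // Mirrors getNumInterleavedAccesses / isLegalInterleavedAccessType above.
    static unsigned numInterleavedAccesses(unsigned VecSizeInBits) {
      return (VecSizeInBits + 127) / 128;     // ceil(bits / 128)
    }
    static bool isLegalInterleavedAccessSize(unsigned VecSizeInBits) {
      return VecSizeInBits == 64 || VecSizeInBits % 128 == 0;
    }
    // Examples: a 512-bit <16 x i32> group -> 4 accesses; a 64-bit <8 x i8>
    // group -> 1 access; a 96-bit <3 x i32> group is rejected as illegal.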
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; - Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); - Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); - Ops.push_back(Builder.getInt32(LI->getAlignment())); + for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { - Type *Tys[] = { VecTy, Int8Ptr }; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); - CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); + // If we're generating more than one load, compute the base address of + // subsequent loads as an offset from the previous. + if (LoadCount > 0) + BaseAddr = Builder.CreateConstGEP1_32( + BaseAddr, VecTy->getVectorNumElements() * Factor); - // Replace uses of each shufflevector with the corresponding vector loaded - // by ldN. - for (unsigned i = 0; i < Shuffles.size(); i++) { - ShuffleVectorInst *SV = Shuffles[i]; - unsigned Index = Indices[i]; + SmallVector<Value *, 2> Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); + Ops.push_back(Builder.getInt32(LI->getAlignment())); - Value *SubVec = Builder.CreateExtractValue(VldN, Index); + CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); - // Convert the integer vector to pointer vector if the element is pointer. - if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); + // Replace uses of each shufflevector with the corresponding vector loaded + // by ldN. + for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SV = Shuffles[i]; + unsigned Index = Indices[i]; - SV->replaceAllUsesWith(SubVec); - } + Value *SubVec = Builder.CreateExtractValue(VldN, Index); - return true; -} + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr( + SubVec, VectorType::get(SV->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); -/// \brief Get a mask consisting of sequential integers starting from \p Start. -/// -/// I.e. <Start, Start + 1, ..., Start + NumElts - 1> -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, - unsigned NumElts) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < NumElts; i++) - Mask.push_back(Builder.getInt32(Start + i)); + SubVecs[SV].push_back(SubVec); + } + } - return ConstantVector::get(Mask); + // Replace uses of the shufflevector instructions with the sub-vectors + // returned by the load intrinsic. If a shufflevector instruction is + // associated with more than one sub-vector, those sub-vectors will be + // concatenated into a single wide vector. + for (ShuffleVectorInst *SVI : Shuffles) { + auto &SubVec = SubVecs[SVI]; + auto *WideVec = + SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; + SVI->replaceAllUsesWith(WideVec); + } + + return true; } /// \brief Lower an interleaved store into a vstN intrinsic. @@ -13279,15 +13858,15 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip if we do not have NEON and skip illegal vector types and vector types - // with i64/f64 elements (vstN doesn't support i64/f64 elements). 
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || - EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; + unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); + Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); @@ -13306,44 +13885,75 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, LaneLen); } + // The base address of the store. + Value *BaseAddr = SI->getPointerOperand(); + + if (NumStores > 1) { + // If we're going to generate more than one store, reset the lane length + // and sub-vector type to something legal. + LaneLen /= NumStores; + SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + + // We will compute the pointer operand of each store from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. + BaseAddr = Builder.CreateBitCast( + BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace())); + } + + assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); + + auto Mask = SVI->getShuffleMask(); + + Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); + Type *Tys[] = {Int8Ptr, SubVecTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, Intrinsic::arm_neon_vst3, Intrinsic::arm_neon_vst4}; - SmallVector<Value *, 6> Ops; - Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); - Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { - Type *Tys[] = { Int8Ptr, SubVecTy }; - Function *VstNFunc = Intrinsic::getDeclaration( - SI->getModule(), StoreInts[Factor - 2], Tys); + // If we generating more than one store, we compute the base address of + // subsequent stores as an offset from the previous. + if (StoreCount > 0) + BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); - // Split the shufflevector operands into sub vectors for the new vstN call. - auto Mask = SVI->getShuffleMask(); - for (unsigned i = 0; i < Factor; i++) { - if (Mask[i] >= 0) { - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); - } else { - unsigned StartMask = 0; - for (unsigned j = 1; j < LaneLen; j++) { - if (Mask[j*Factor + i] >= 0) { - StartMask = Mask[j*Factor + i] - j; - break; + SmallVector<Value *, 6> Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); + + Function *VstNFunc = + Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); + + // Split the shufflevector operands into sub vectors for the new vstN call. + for (unsigned i = 0; i < Factor; i++) { + unsigned IdxI = StoreCount * LaneLen * Factor + i; + if (Mask[IdxI] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + unsigned IdxJ = StoreCount * LaneLen * Factor + j; + if (Mask[IdxJ * Factor + IdxI] >= 0) { + StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; + break; + } } + // Note: If all elements in a chunk are undefs, StartMask=0! 
+ // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). + // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in + // isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } - // Note: If all elements in a chunk are undefs, StartMask=0! - // Note: Filling undef gaps with random elements is ok, since - // those elements were being written anyway (with undefs). - // In the case of all undefs we're defaulting to using elems from 0 - // Note: StartMask cannot be negative, it's checked in isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); } - } - Ops.push_back(Builder.getInt32(SI->getAlignment())); - Builder.CreateCall(VstNFunc, Ops); + Ops.push_back(Builder.getInt32(SI->getAlignment())); + Builder.CreateCall(VstNFunc, Ops); + } return true; } @@ -13484,3 +14094,8 @@ void ARMTargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { + MF.getFrameInfo().computeMaxCallFrameSize(MF); + TargetLoweringBase::finalizeLowering(MF); +} |
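One last reference point for the vstN splitting above: createSequentialMask(Builder, Start, LaneLen, 0) takes over from the removed getSequentialMask helper and, per that helper's documentation, produces the shuffle mask <Start, Start+1, ..., Start+LaneLen-1>. A plain sketch of that index math, assuming the documented behaviour (sequentialMask is an illustrative name, not an LLVM API):

    #include <vector>

    // The lane indices a single vstN operand pulls out of the original
    // shufflevector inputs: a contiguous LaneLen-wide slice starting at Start.
    static std::vector<int> sequentialMask(int Start, unsigned LaneLen) {
      std::vector<int> Mask;
      for (unsigned j = 0; j != LaneLen; ++j)
        Mask.push_back(Start + (int)j);
      return Mask;
    }
    // When the wide store is split, Start is taken from
    // Mask[StoreCount * LaneLen * Factor + i] in the defined-element case, so
    // each of the NumStores chunks re-slices its own LaneLen-element window of
    // the original shuffle.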