| author | dim <dim@FreeBSD.org> | 2013-12-22 00:04:03 +0000 |
|---|---|---|
| committer | dim <dim@FreeBSD.org> | 2013-12-22 00:04:03 +0000 |
| commit | 8cf58e3ee36bd550746fca361a894e2727485200 (patch) | |
| tree | 2ba0398b4c42ad4f55561327538044fd2c925a8b /lib/Target/AArch64 | |
| parent | aa45f148926e3461a1fd8b10c990f0a51a908cc9 (diff) | |
Vendor import of llvm release_34 branch r197841 (effectively, 3.4 RC3):
https://llvm.org/svn/llvm-project/llvm/branches/release_34@197841
Diffstat (limited to 'lib/Target/AArch64')
38 files changed, 14159 insertions, 511 deletions
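Before the diff itself, a minimal standalone sketch of the register-printing scheme that the rewritten `printModifiedFPRAsmOperand` in `AArch64AsmPrinter.cpp` switches to in this import: instead of matching the operand's register aliases against one specific FPR register class per inline-asm modifier, the new code prints the modifier letter followed by the register's encoding value obtained from the TargetRegisterInfo. The helper `modifiedFPRName` and the `main` driver below are hypothetical illustrations, not part of the commit; the real code writes straight to a `raw_ostream` via `TRI->getEncodingValue()`.

```cpp
#include <iostream>
#include <string>

// Illustrative stand-in for the new printModifiedFPRAsmOperand behaviour:
// the inline-asm modifier letter ('b', 'h', 's', 'd' or 'q') is printed
// directly, followed by the register's hardware encoding value.
static std::string modifiedFPRName(char RegType, unsigned EncodingValue) {
  return std::string(1, RegType) + std::to_string(EncodingValue);
}

int main() {
  // The same encoding yields different scalar views under different
  // modifiers, e.g. encoding 7 prints as d7 or q7.
  std::cout << modifiedFPRName('d', 7) << '\n'; // d7
  std::cout << modifiedFPRName('q', 7) << '\n'; // q7
  return 0;
}
```

Because the printed name is built from the modifier letter itself, the five separate 'b'/'h'/'s'/'d'/'q' cases in `PrintAsmOperand` can collapse into a single call, which is what the AArch64AsmPrinter.cpp hunk below does.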
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e17052b..9c2c69a 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -21,8 +21,11 @@ include "llvm/Target/Target.td" // AArch64 Subtarget features. // +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", + "Enable ARMv8 FP">; + def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable Advanced SIMD instructions">; + "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable cryptographic instructions">; @@ -33,7 +36,7 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", include "AArch64Schedule.td" -def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>; +def : Processor<"generic", GenericItineraries, [FeatureFPARMv8]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 47ebb82..d59ca56 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -27,32 +27,23 @@ using namespace llvm; -MachineLocation -AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - // See emitFrameIndexDebugValue in InstrInfo for where this instruction is - // expected to be created. - assert(MI->getNumOperands() == 4 && MI->getOperand(0).isReg() - && MI->getOperand(1).isImm() && "unexpected custom DBG_VALUE"); - return MachineLocation(MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); -} - /// Try to print a floating-point register as if it belonged to a specified /// register-class. For example the inline asm operand modifier "b" requires its /// argument to be printed as "bN". static bool printModifiedFPRAsmOperand(const MachineOperand &MO, const TargetRegisterInfo *TRI, - const TargetRegisterClass &RegClass, - raw_ostream &O) { + char RegType, raw_ostream &O) { if (!MO.isReg()) return true; for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) { - if (RegClass.contains(*AR)) { - O << AArch64InstPrinter::getRegisterName(*AR); + if (AArch64::FPR8RegClass.contains(*AR)) { + O << RegType << TRI->getEncodingValue(MO.getReg()); return false; } } + + // The register doesn't correspond to anything floating-point like. return true; } @@ -91,9 +82,9 @@ bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO, StringRef Modifier; switch (MO.getType()) { default: - llvm_unreachable("Unexpected operand for symbolic address constraint"); + return true; case MachineOperand::MO_GlobalAddress: - Name = Mang->getSymbol(MO.getGlobal())->getName(); + Name = getSymbol(MO.getGlobal())->getName(); // Global variables may be accessed either via a GOT or in various fun and // interesting TLS-model specific ways. Set the prefix modifier as @@ -155,57 +146,29 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); - if (!ExtraCode || !ExtraCode[0]) { - // There's actually no operand modifier, which leads to a slightly eclectic - // set of behaviour which we have to handle here. 
- const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - llvm_unreachable("Unexpected operand for inline assembly"); - case MachineOperand::MO_Register: - // GCC prints the unmodified operand of a 'w' constraint as the vector - // register. Technically, we could allocate the argument as a VPR128, but - // that leads to extremely dodgy copies being generated to get the data - // there. - if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O)) - O << AArch64InstPrinter::getRegisterName(MO.getReg()); - break; - case MachineOperand::MO_Immediate: - O << '#' << MO.getImm(); - break; - case MachineOperand::MO_FPImmediate: - assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected"); - O << "#0.0"; - break; - case MachineOperand::MO_BlockAddress: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - return printSymbolicAddress(MO, false, "", O); - } - return false; - } - // We have a real modifier to handle. + if (!ExtraCode) + ExtraCode = ""; + switch(ExtraCode[0]) { default: - // See if this is a generic operand - return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); - case 'c': // Don't print "#" before an immediate operand. - if (!MI->getOperand(OpNum).isImm()) - return true; - O << MI->getOperand(OpNum).getImm(); - return false; + if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + return false; + break; case 'w': // Output 32-bit general register operand, constant zero as wzr, or stack // pointer as wsp. Ignored when used with other operand types. - return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR32RegClass, O); + if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::GPR32RegClass, O)) + return false; + break; case 'x': // Output 64-bit general register operand, constant zero as xzr, or stack // pointer as sp. Ignored when used with other operand types. - return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR64RegClass, O); + if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::GPR64RegClass, O)) + return false; + break; case 'H': // Output higher numbered of a 64-bit general register pair case 'Q': @@ -221,40 +184,65 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // copies ...). llvm_unreachable("FIXME: Unimplemented register pairs"); case 'b': - // Output 8-bit FP/SIMD scalar register operand, prefixed with b. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR8RegClass, O); case 'h': - // Output 16-bit FP/SIMD scalar register operand, prefixed with h. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR16RegClass, O); case 's': - // Output 32-bit FP/SIMD scalar register operand, prefixed with s. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR32RegClass, O); case 'd': - // Output 64-bit FP/SIMD scalar register operand, prefixed with d. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR64RegClass, O); case 'q': - // Output 128-bit FP/SIMD scalar register operand, prefixed with q. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR128RegClass, O); + if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + ExtraCode[0], O)) + return false; + break; case 'A': // Output symbolic address with appropriate relocation modifier (also // suitable for ADRP). 
- return printSymbolicAddress(MI->getOperand(OpNum), false, "", O); + if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O)) + return false; + break; case 'L': // Output bits 11:0 of symbolic address with appropriate :lo12: relocation // modifier. - return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O); + if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O)) + return false; + break; case 'G': // Output bits 23:12 of symbolic address with appropriate :hi12: relocation // modifier (currently only for TLS local exec). - return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O); + if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O)) + return false; + break; + case 'a': + return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); } + // There's actually no operand modifier, which leads to a slightly eclectic + // set of behaviour which we have to handle here. + const MachineOperand &MO = MI->getOperand(OpNum); + switch (MO.getType()) { + default: + llvm_unreachable("Unexpected operand for inline assembly"); + case MachineOperand::MO_Register: + // GCC prints the unmodified operand of a 'w' constraint as the vector + // register. Technically, we could allocate the argument as a VPR128, but + // that leads to extremely dodgy copies being generated to get the data + // there. + if (printModifiedFPRAsmOperand(MO, TRI, 'v', O)) + O << AArch64InstPrinter::getRegisterName(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + O << '#' << MO.getImm(); + break; + case MachineOperand::MO_FPImmediate: + assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected"); + O << "#0.0"; + break; + case MachineOperand::MO_BlockAddress: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + return printSymbolicAddress(MO, false, "", O); + } + return false; } bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, @@ -271,24 +259,6 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } -void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps==4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. 
- assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '[' << AArch64InstPrinter::getRegisterName(MI->getOperand(0).getReg()); - OS << '+' << MI->getOperand(1).getImm(); - OS << ']'; - OS << "+" << MI->getOperand(NOps - 2).getImm(); -} - - #include "AArch64GenMCPseudoLowering.inc" void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { @@ -296,18 +266,6 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(OutStreamer, MI)) return; - switch (MI->getOpcode()) { - case AArch64::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } - } - MCInst TmpInst; LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this); OutStreamer.EmitInstruction(TmpInst); @@ -329,7 +287,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer.EmitLabel(Stubs[i].first); OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(0), 0); + TD->getPointerSize(0)); } Stubs.clear(); } diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h index af0c9fe..824f003 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.h +++ b/lib/Target/AArch64/AArch64AsmPrinter.h @@ -55,8 +55,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { unsigned AsmVariant, const char *ExtraCode, raw_ostream &O); - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - /// printSymbolicAddress - Given some kind of reasonably bare symbolic /// reference, print out the appropriate asm string to represent it. If /// appropriate, a relocation-specifier will be produced, composed of a @@ -67,8 +65,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { bool PrintImmediatePrefix, StringRef Suffix, raw_ostream &O); - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; - virtual const char *getPassName() const { return "AArch64 Assembly Printer"; } diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp index 71233ba..11e7f41 100644 --- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp +++ b/lib/Target/AArch64/AArch64BranchFixupPass.cpp @@ -87,7 +87,7 @@ namespace { // If the block size isn't a multiple of the known bits, assume the // worst case padding. if (Size & ((1u << Bits) - 1)) - Bits = CountTrailingZeros_32(Size); + Bits = countTrailingZeros(Size); return Bits; } diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td index b880d83..a2a9f3f 100644 --- a/lib/Target/AArch64/AArch64CallingConv.td +++ b/lib/Target/AArch64/AArch64CallingConv.td @@ -59,9 +59,9 @@ def CC_A64_APCS : CallingConv<[ // Canonicalise the various types that live in different floating-point // registers. This makes sense because the PCS does not distinguish Short // Vectors and Floating-point types. 
- CCIfType<[v2i8], CCBitConvertToType<f16>>, - CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>, - CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>, + CCIfType<[v1i32, v4i8, v2i16, v1f32], CCBitConvertToType<f32>>, + CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCBitConvertToType<f128>>, @@ -70,7 +70,8 @@ def CC_A64_APCS : CallingConv<[ // argument is allocated to the least significant bits of register // v[NSRN]. The NSRN is incremented by one. The argument has now been // allocated." - CCIfType<[f16], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, + CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, + CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index daa7f1d..7318230 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -54,7 +54,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MachineModuleInfo &MMI = MF.getMMI(); - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); bool NeedsFrameMoves = MMI.hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry(); @@ -97,8 +97,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(SPLabel); MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::XSP, NumInitialBytes); - Moves.push_back(MachineMove(SPLabel, Dst, Src)); + unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); + MMI.addFrameInst( + MCCFIInstruction::createDefCfa(SPLabel, Reg, -NumInitialBytes)); } // Otherwise we need to set the frame pointer and/or add a second stack @@ -131,9 +132,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL)) .addSym(FPLabel); - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::X29, -MFI->getObjectOffset(X29FrameIdx)); - Moves.push_back(MachineMove(FPLabel, Dst, Src)); + unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true); + unsigned Offset = MFI->getObjectOffset(X29FrameIdx); + MMI.addFrameInst(MCCFIInstruction::createDefCfa(FPLabel, Reg, Offset)); } FPNeedsSetting = false; @@ -164,8 +165,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(CSLabel); MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::XSP, NumResidualBytes + NumInitialBytes); - Moves.push_back(MachineMove(CSLabel, Dst, Src)); + unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); + unsigned Offset = NumResidualBytes + NumInitialBytes; + MMI.addFrameInst(MCCFIInstruction::createDefCfa(CSLabel, Reg, -Offset)); } // And any callee-saved registers (it's fine to leave them to the end here, @@ -180,10 +182,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { - MachineLocation 
Dst(MachineLocation::VirtualFP, - MFI->getObjectOffset(I->getFrameIdx())); - MachineLocation Src(I->getReg()); - Moves.push_back(MachineMove(CSLabel, Dst, Src)); + unsigned Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true); + MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, Reg, Offset)); } } } @@ -424,7 +425,7 @@ AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI, - LoadStoreMethod PossClasses[], + const LoadStoreMethod PossClasses[], unsigned NumClasses) const { DebugLoc DL = MBB.findDebugLoc(MBBI); MachineFunction &MF = *MBB.getParent(); @@ -527,11 +528,11 @@ AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; - static LoadStoreMethod PossibleClasses[] = { + static const LoadStoreMethod PossibleClasses[] = { {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR}, {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR}, }; - unsigned NumClasses = llvm::array_lengthof(PossibleClasses); + const unsigned NumClasses = llvm::array_lengthof(PossibleClasses); emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI, PossibleClasses, NumClasses); @@ -548,11 +549,11 @@ AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; - static LoadStoreMethod PossibleClasses[] = { + static const LoadStoreMethod PossibleClasses[] = { {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR}, {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR}, }; - unsigned NumClasses = llvm::array_lengthof(PossibleClasses); + const unsigned NumClasses = llvm::array_lengthof(PossibleClasses); emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI, PossibleClasses, NumClasses); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 45ea0ec..032dd90 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -90,7 +90,7 @@ public: MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI, - LoadStoreMethod PossibleClasses[], + const LoadStoreMethod PossibleClasses[], unsigned NumClasses) const; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 102c71b..ef99541 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -33,7 +33,6 @@ namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { AArch64TargetMachine &TM; - const AArch64InstrInfo *TII; /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -43,7 +42,6 @@ public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), TM(tm), - TII(static_cast<const AArch64InstrInfo*>(TM.getInstrInfo())), Subtarget(&TM.getSubtarget<AArch64Subtarget>()) { } @@ -72,10 +70,11 @@ public: /// Used for pre-lowered address-reference nodes, so we already know /// the fields match. This operand's job is simply to add an - /// appropriate shift operand (i.e. 0) to the MOVZ/MOVK instruction. + /// appropriate shift operand to the MOVZ/MOVK instruction. 
+ template<unsigned LogShift> bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) { Imm = N; - Shift = CurDAG->getTargetConstant(0, MVT::i32); + Shift = CurDAG->getTargetConstant(LogShift, MVT::i32); return true; } @@ -102,7 +101,7 @@ public: /// Put the given constant into a pool and return a DAG which will give its /// address. - SDValue getConstantPoolItemAddress(DebugLoc DL, const Constant *CV); + SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV); SDNode *TrySelectToMoveImm(SDNode *N); SDNode *LowerToFPLitPool(SDNode *Node); @@ -110,6 +109,45 @@ public: SDNode* Select(SDNode*); private: + /// Get the opcode for table lookup instruction + unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec); + + /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4. + /// IsExt is to indicate if the result will be extended with an argument. + SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt); + + /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4. + SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcode); + + /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4. + SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes); + + /// Form sequences of consecutive 64/128-bit registers for use in NEON + /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have + /// between 1 and 4 elements. If it contains a single element that is returned + /// unchanged; otherwise a REG_SEQUENCE value is returned. + SDValue createDTuple(ArrayRef<SDValue> Vecs); + SDValue createQTuple(ArrayRef<SDValue> Vecs); + + /// Generic helper for the createDTuple/createQTuple + /// functions. Those should almost always be called instead. + SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[], + unsigned SubRegs[]); + + /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4. + /// The opcode array specifies the instructions used for load. + SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes); + + /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4. + /// The opcode arrays specify the instructions used for load/store. 
+ SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, + unsigned NumVecs, const uint16_t *Opcodes); + + SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, + SDValue Operand); }; } @@ -191,7 +229,7 @@ bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) { SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) { SDNode *ResNode; - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT DestType = Node->getValueType(0); unsigned DestWidth = DestType.getSizeInBits(); @@ -241,14 +279,14 @@ SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) { } SDValue -AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL, +AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL, const Constant *CV) { - EVT PtrVT = TLI.getPointerTy(); + EVT PtrVT = getTargetLowering()->getPointerTy(); - switch (TLI.getTargetMachine().getCodeModel()) { + switch (getTargetLowering()->getTargetMachine().getCodeModel()) { case CodeModel::Small: { unsigned Alignment = - TLI.getDataLayout()->getABITypeAlignment(CV->getType()); + getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); return CurDAG->getNode( AArch64ISD::WrapperSmall, DL, PtrVT, CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG), @@ -260,15 +298,15 @@ AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL, LitAddr = CurDAG->getMachineNode( AArch64::MOVZxii, DL, PtrVT, CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3), - CurDAG->getTargetConstant(0, MVT::i32)); + CurDAG->getTargetConstant(3, MVT::i32)); LitAddr = CurDAG->getMachineNode( AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC), - CurDAG->getTargetConstant(0, MVT::i32)); + CurDAG->getTargetConstant(2, MVT::i32)); LitAddr = CurDAG->getMachineNode( AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC), - CurDAG->getTargetConstant(0, MVT::i32)); + CurDAG->getTargetConstant(1, MVT::i32)); LitAddr = CurDAG->getMachineNode( AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC), @@ -281,7 +319,7 @@ AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL, } SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { - DebugLoc DL = Node->getDebugLoc(); + SDLoc DL(Node); uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue(); int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue(); EVT DestType = Node->getValueType(0); @@ -312,7 +350,8 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { MemType.getSizeInBits()), UnsignedVal); SDValue PoolAddr = getConstantPoolItemAddress(DL, CV); - unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(CV->getType()); + unsigned Alignment = + getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(), PoolAddr, @@ -323,11 +362,12 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { } SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) { - DebugLoc DL = Node->getDebugLoc(); + SDLoc DL(Node); const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue(); EVT DestType = Node->getValueType(0); - unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(FV->getType()); + unsigned Alignment = + getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType()); SDValue PoolAddr = 
getConstantPoolItemAddress(DL, FV); return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr, @@ -389,12 +429,607 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8, &Ops[0], Ops.size()); } +SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) { + static unsigned RegClassIDs[] = { AArch64::DPairRegClassID, + AArch64::DTripleRegClassID, + AArch64::DQuadRegClassID }; + static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1, + AArch64::dsub_2, AArch64::dsub_3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) { + static unsigned RegClassIDs[] = { AArch64::QPairRegClassID, + AArch64::QTripleRegClassID, + AArch64::QQuadRegClassID }; + static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1, + AArch64::qsub_2, AArch64::qsub_3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs, + unsigned RegClassIDs[], + unsigned SubRegs[]) { + // There's no special register-class for a vector-list of 1 element: it's just + // a vector. + if (Regs.size() == 1) + return Regs[0]; + + assert(Regs.size() >= 2 && Regs.size() <= 4); + + SDLoc DL(Regs[0].getNode()); + + SmallVector<SDValue, 4> Ops; + + // First operand of REG_SEQUENCE is the desired RegClass. + Ops.push_back( + CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); + + // Then we get pairs of source & subregister-position for the components. + for (unsigned i = 0; i < Regs.size(); ++i) { + Ops.push_back(Regs[i]); + Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); + } + + SDNode *N = + CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); + return SDValue(N, 0); +} + + +// Get the register stride update opcode of a VLD/VST instruction that +// is otherwise equivalent to the given fixed stride updating instruction. 
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { + switch (Opc) { + default: break; + case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register; + case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register; + case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register; + case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register; + case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register; + case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register; + case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register; + case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register; + + case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register; + case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register; + case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register; + case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register; + case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register; + case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register; + case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register; + + case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register; + case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register; + case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register; + case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register; + case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register; + case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register; + case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register; + + case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register; + case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register; + case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register; + case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register; + case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register; + case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register; + case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register; + + case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register; + case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register; + case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register; + case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register; + case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register; + case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register; + case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register; + case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register; + + case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register; + case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register; + case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register; + case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register; + case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register; + case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register; + case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register; + case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register; + + case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register; + case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register; + case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register; + case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register; + case AArch64::LD1x4WB_16B_fixed: return 
AArch64::LD1x4WB_16B_register; + case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register; + case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register; + case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register; + + case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register; + case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register; + case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register; + case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register; + case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register; + case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register; + case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register; + case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register; + + case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register; + case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register; + case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register; + case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register; + case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register; + case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register; + case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register; + + case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register; + case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register; + case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register; + case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register; + case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register; + case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register; + case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register; + + case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register; + case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register; + case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register; + case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register; + case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register; + case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register; + case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register; + + case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register; + case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register; + case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register; + case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register; + case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register; + case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register; + case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register; + case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register; + + case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register; + case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register; + case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register; + case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register; + case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register; + case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register; + case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register; + case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register; + + case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register; + case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register; + case AArch64::ST1x4WB_2S_fixed: return 
AArch64::ST1x4WB_2S_register; + case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register; + case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register; + case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register; + case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register; + case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register; + + // Post-index of duplicate loads + case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register; + case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register; + case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register; + case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register; + case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register; + case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register; + case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register; + case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register; + + case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register; + case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register; + case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register; + case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register; + case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register; + case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register; + case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register; + case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register; + + case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register; + case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register; + case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register; + case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register; + case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register; + case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register; + case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register; + case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register; + + // Post-index of lane loads + case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register; + case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register; + case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register; + case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register; + + case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register; + case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register; + case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register; + case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register; + + case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register; + case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register; + case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register; + case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register; + + // Post-index of lane stores + case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register; + case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register; + case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register; + case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register; + + case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register; + case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register; + case AArch64::ST3LN_WB_S_fixed: return 
AArch64::ST3LN_WB_S_register; + case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register; + + case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register; + case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register; + case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register; + case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register; + } + return Opc; // If not one we handle, return it unchanged. +} + +SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + + EVT VT = N->getValueType(0); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; + default: llvm_unreachable("unhandled vector load type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector<SDValue, 2> Ops; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + Ops.push_back(N->getOperand(0)); // Push back the Chain + + SmallVector<EVT, 3> ResTys; + // Push back the type of return super register + if (NumVecs == 1) + ResTys.push_back(VT); + else if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of the Chain + SDLoc dl(N); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + + if (NumVecs == 1) + return VLd; + + // If NumVecs > 1, the return result is a super register containing 2-4 + // consecutive vector registers. + SDValue SuperReg = SDValue(VLd, 0); + + unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + // Update users of the Chain + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); + + return NULL; +} + +SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + SDLoc dl(N); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; + EVT VT = N->getOperand(Vec0Idx).getValueType(); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 
1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; + default: llvm_unreachable("unhandled vector store type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector<EVT, 2> ResTys; + if (isUpdating) + ResTys.push_back(MVT::i64); + ResTys.push_back(MVT::Other); // Type for the Chain + + SmallVector<SDValue, 6> Ops; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs); + Ops.push_back(SrcReg); + + // Push back the Chain + Ops.push_back(N->getOperand(0)); + + // Transfer memoperands. + SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; +} + +SDValue +AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, + SDValue Operand) { + SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL, + VT, VTD, MVT::Other, + CurDAG->getTargetConstant(0, MVT::i64), + Operand, + CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32)); + return SDValue(Reg, 0); +} + +SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range"); + SDLoc dl(N); + + EVT VT = N->getValueType(0); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; + default: llvm_unreachable("unhandled vector duplicate lane load type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SDValue SuperReg; + SmallVector<SDValue, 6> Ops; + Ops.push_back(N->getOperand(1)); // Push back the Memory Address + if (isUpdating) { + SDValue Inc = N->getOperand(2); + if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + Ops.push_back(N->getOperand(0)); // Push back the Chain + + SmallVector<EVT, 3> ResTys; + // Push back the type of return super register + if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of the Chain + SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); + + SuperReg = SDValue(VLdDup, 0); + unsigned Sub0 = is64BitVector ? 
AArch64::dsub_0 : AArch64::qsub_0; + // Update uses of each registers in super register + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + // Update uses of the Chain + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); + return NULL; +} + +// We only have 128-bit vector type of load/store lane instructions. +// If it is 64-bit vector, we also select it to the 128-bit instructions. +// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and +// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output. +SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + SDLoc dl(N); + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + EVT VT64; // 64-bit Vector Type + + if (is64BitVector) { + VT64 = VT; + VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + } + + unsigned OpcodeIndex; + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = 0; break; + case 16: OpcodeIndex = 1; break; + case 32: OpcodeIndex = 2; break; + case 64: OpcodeIndex = 3; break; + default: llvm_unreachable("unhandled vector lane load/store type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector<EVT, 3> ResTys; + if (IsLoad) { + // Push back the type of return super register + if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + } + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of Chain + SmallVector<SDValue, 5> Ops; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + if (is64BitVector) + for (unsigned i = 0; i < Regs.size(); i++) + Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]); + SDValue SuperReg = createQTuple(Regs); + + Ops.push_back(SuperReg); // Source Reg + SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32); + Ops.push_back(LaneValue); + Ops.push_back(Chain); // Push back the Chain + + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); + if (!IsLoad) + return VLdLn; + + // Extract the subregisters. 
+ SuperReg = SDValue(VLdLn, 0); + unsigned Sub0 = AArch64::qsub_0; + // Update uses of each registers in super register + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg); + if (is64BitVector) { + SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0); + } + ReplaceUses(SDValue(N, Vec), SUB0); + } + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); + return NULL; +} + +unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit, + unsigned NumOfVec) { + assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range"); + + unsigned Opc = 0; + switch (NumOfVec) { + default: + break; + case 1: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b; + else + Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b; + break; + case 2: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b; + else + Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b; + break; + case 3: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b; + else + Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b; + break; + case 4: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b; + else + Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b; + break; + } + + return Opc; +} + +SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs, + bool IsExt) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + SDLoc dl(N); + + // Check the element of look up table is 64-bit or not + unsigned Vec0Idx = IsExt ? 2 : 1; + assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() && + "The element of lookup table for vtbl and vtbx must be 128-bit"); + + // Check the return value type is 64-bit or not + EVT ResVT = N->getValueType(0); + bool is64BitRes = ResVT.is64BitVector(); + + // Create new SDValue for vector list + SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + SDValue TblReg = createQTuple(Regs); + unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs); + + SmallVector<SDValue, 3> Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(TblReg); + Ops.push_back(N->getOperand(Vec0Idx + NumVecs)); + return CurDAG->getMachineNode(Opc, dl, ResVT, Ops); +} + SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n"); if (Node->isMachineOpcode()) { DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); + Node->setNodeId(-1); return NULL; } @@ -473,7 +1108,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ATOMIC_CMP_SWAP_I64); case ISD::FrameIndex: { int FI = cast<FrameIndexSDNode>(Node)->getIndex(); - EVT PtrTy = TLI.getPointerTy(); + EVT PtrTy = getTargetLowering()->getPointerTy(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy); return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy, TFI, CurDAG->getTargetConstant(0, PtrTy)); @@ -497,7 +1132,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type"); uint16_t Register = Ty == MVT::i32 ? 
AArch64::WZR : AArch64::XZR; ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - Node->getDebugLoc(), + SDLoc(Node), Register, Ty).getNode(); } @@ -534,6 +1169,399 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { Node = ResNode; break; } + case AArch64ISD::NEON_LD1_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1WB_8B_fixed, AArch64::LD1WB_4H_fixed, + AArch64::LD1WB_2S_fixed, AArch64::LD1WB_1D_fixed, + AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed, + AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed + }; + return SelectVLD(Node, true, 1, Opcodes); + } + case AArch64ISD::NEON_LD2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2WB_8B_fixed, AArch64::LD2WB_4H_fixed, + AArch64::LD2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, + AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed, + AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed + }; + return SelectVLD(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3WB_8B_fixed, AArch64::LD3WB_4H_fixed, + AArch64::LD3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, + AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed, + AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed + }; + return SelectVLD(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4WB_8B_fixed, AArch64::LD4WB_4H_fixed, + AArch64::LD4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, + AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed, + AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed + }; + return SelectVLD(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD1x2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1x2WB_8B_fixed, AArch64::LD1x2WB_4H_fixed, + AArch64::LD1x2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, + AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed, + AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed + }; + return SelectVLD(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD1x3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1x3WB_8B_fixed, AArch64::LD1x3WB_4H_fixed, + AArch64::LD1x3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, + AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed, + AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed + }; + return SelectVLD(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD1x4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1x4WB_8B_fixed, AArch64::LD1x4WB_4H_fixed, + AArch64::LD1x4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, + AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed, + AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed + }; + return SelectVLD(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST1_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1WB_8B_fixed, AArch64::ST1WB_4H_fixed, + AArch64::ST1WB_2S_fixed, AArch64::ST1WB_1D_fixed, + AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed, + AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed + }; + return SelectVST(Node, true, 1, Opcodes); + } + case AArch64ISD::NEON_ST2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST2WB_8B_fixed, AArch64::ST2WB_4H_fixed, + AArch64::ST2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, + AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed, + AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed + }; + return SelectVST(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST3WB_8B_fixed, AArch64::ST3WB_4H_fixed, + AArch64::ST3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, + AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed, + 
AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed + }; + return SelectVST(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST4WB_8B_fixed, AArch64::ST4WB_4H_fixed, + AArch64::ST4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, + AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed, + AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed + }; + return SelectVST(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD2DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S, + AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H, + AArch64::LD2R_4S, AArch64::LD2R_2D + }; + return SelectVLDDup(Node, false, 2, Opcodes); + } + case AArch64ISD::NEON_LD3DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S, + AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H, + AArch64::LD3R_4S, AArch64::LD3R_2D + }; + return SelectVLDDup(Node, false, 3, Opcodes); + } + case AArch64ISD::NEON_LD4DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S, + AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H, + AArch64::LD4R_4S, AArch64::LD4R_2D + }; + return SelectVLDDup(Node, false, 4, Opcodes); + } + case AArch64ISD::NEON_LD2DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed, + AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed, + AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed, + AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed, + AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed, + AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed, + AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed, + AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed, + AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed, + AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD2LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed, + AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed, + AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed, + AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST2LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed, + AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST3LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed, + AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed + }; + return 
SelectVLDSTLane(Node, false, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST4LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed, + AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST1x2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1x2WB_8B_fixed, AArch64::ST1x2WB_4H_fixed, + AArch64::ST1x2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, + AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed, + AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed + }; + return SelectVST(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST1x3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1x3WB_8B_fixed, AArch64::ST1x3WB_4H_fixed, + AArch64::ST1x3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, + AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed, + AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed + }; + return SelectVST(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST1x4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1x4WB_8B_fixed, AArch64::ST1x4WB_4H_fixed, + AArch64::ST1x4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, + AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed, + AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed + }; + return SelectVST(Node, true, 4, Opcodes); + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); + bool IsExt = false; + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_vtbx1: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl1: + return SelectVTBL(Node, 1, IsExt); + case Intrinsic::aarch64_neon_vtbx2: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl2: + return SelectVTBL(Node, 2, IsExt); + case Intrinsic::aarch64_neon_vtbx3: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl3: + return SelectVTBL(Node, 3, IsExt); + case Intrinsic::aarch64_neon_vtbx4: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl4: + return SelectVTBL(Node, 4, IsExt); + } + break; + } + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::arm_neon_vld1: { + static const uint16_t Opcodes[] = { + AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D, + AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D + }; + return SelectVLD(Node, false, 1, Opcodes); + } + case Intrinsic::arm_neon_vld2: { + static const uint16_t Opcodes[] = { + AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D, + AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D + }; + return SelectVLD(Node, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vld3: { + static const uint16_t Opcodes[] = { + AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D, + AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D + }; + return SelectVLD(Node, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vld4: { + static const uint16_t Opcodes[] = { + AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D, + AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D + }; + return SelectVLD(Node, false, 4, Opcodes); + } + case Intrinsic::aarch64_neon_vld1x2: { + static const uint16_t Opcodes[] = { + AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S, + AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H, + AArch64::LD1x2_4S, AArch64::LD1x2_2D 
+ }; + return SelectVLD(Node, false, 2, Opcodes); + } + case Intrinsic::aarch64_neon_vld1x3: { + static const uint16_t Opcodes[] = { + AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S, + AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H, + AArch64::LD1x3_4S, AArch64::LD1x3_2D + }; + return SelectVLD(Node, false, 3, Opcodes); + } + case Intrinsic::aarch64_neon_vld1x4: { + static const uint16_t Opcodes[] = { + AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S, + AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H, + AArch64::LD1x4_4S, AArch64::LD1x4_2D + }; + return SelectVLD(Node, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vst1: { + static const uint16_t Opcodes[] = { + AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D, + AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D + }; + return SelectVST(Node, false, 1, Opcodes); + } + case Intrinsic::arm_neon_vst2: { + static const uint16_t Opcodes[] = { + AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D, + AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D + }; + return SelectVST(Node, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vst3: { + static const uint16_t Opcodes[] = { + AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D, + AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D + }; + return SelectVST(Node, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vst4: { + static const uint16_t Opcodes[] = { + AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D, + AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D + }; + return SelectVST(Node, false, 4, Opcodes); + } + case Intrinsic::aarch64_neon_vst1x2: { + static const uint16_t Opcodes[] = { + AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S, + AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H, + AArch64::ST1x2_4S, AArch64::ST1x2_2D + }; + return SelectVST(Node, false, 2, Opcodes); + } + case Intrinsic::aarch64_neon_vst1x3: { + static const uint16_t Opcodes[] = { + AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S, + AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H, + AArch64::ST1x3_4S, AArch64::ST1x3_2D + }; + return SelectVST(Node, false, 3, Opcodes); + } + case Intrinsic::aarch64_neon_vst1x4: { + static const uint16_t Opcodes[] = { + AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S, + AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H, + AArch64::ST1x4_4S, AArch64::ST1x4_2D + }; + return SelectVST(Node, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vld2lane: { + static const uint16_t Opcodes[] = { + AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D + }; + return SelectVLDSTLane(Node, true, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vld3lane: { + static const uint16_t Opcodes[] = { + AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D + }; + return SelectVLDSTLane(Node, true, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vld4lane: { + static const uint16_t Opcodes[] = { + AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D + }; + return SelectVLDSTLane(Node, true, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vst2lane: { + static const uint16_t Opcodes[] = { + AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D + }; + return SelectVLDSTLane(Node, false, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vst3lane: { + static const uint16_t Opcodes[] = { + AArch64::ST3LN_B, 
AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D + }; + return SelectVLDSTLane(Node, false, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vst4lane: { + static const uint16_t Opcodes[] = { + AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D + }; + return SelectVLDSTLane(Node, false, false, 4, Opcodes); + } + } // End of switch IntNo + break; + } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN default: break; // Let generic code handle it } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 56f6751..4fdb667 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -39,12 +39,10 @@ static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) { llvm_unreachable("unknown subtarget type"); } - AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)), - Subtarget(&TM.getSubtarget<AArch64Subtarget>()), - RegInfo(TM.getRegisterInfo()), - Itins(TM.getInstrItineraryData()) { + : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) { + + const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>(); // SIMD compares set the entire lane's bits to 1 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -52,10 +50,34 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) // Scalar register <-> type mapping addRegisterClass(MVT::i32, &AArch64::GPR32RegClass); addRegisterClass(MVT::i64, &AArch64::GPR64RegClass); - addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); - addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); - addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); - addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + + if (Subtarget->hasFPARMv8()) { + addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + } + + if (Subtarget->hasNEON()) { + // And the vectors + addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass); + } computeRegisterProperties(); @@ -64,6 +86,12 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SHL); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); // AArch64 does not have i1 loads, or much of anything for i1 really. 
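For orientation, the SelectVLD/SelectVST/SelectVLDSTLane calls above each pick one entry out of an eight-element opcode table whose layout, judging from the arrays themselves, is { 8B, 4H, 2S, 1D, 16B, 8H, 4S, 2D }. A minimal standalone sketch of that indexing (the helper name and signature are illustrative, not part of the patch or the LLVM API):

    #include <cassert>

    // Hypothetical helper: elemBytes is the element size in bytes (1,2,4,8);
    // is128 selects the 128-bit (Q-register) half of the table.
    static unsigned vectorOpcodeIndex(unsigned elemBytes, bool is128) {
      unsigned log2 = 0;
      while ((1u << log2) < elemBytes) ++log2;   // 1->0, 2->1, 4->2, 8->3
      assert(log2 < 4 && (1u << log2) == elemBytes && "unsupported element size");
      return (is128 ? 4 : 0) + log2;             // index 0..7 into the opcode table
    }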
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -253,14 +281,97 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); - setExceptionPointerRegister(AArch64::X0); setExceptionSelectorRegister(AArch64::X1); + + if (Subtarget->hasNEON()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal); + + setOperationAction(ISD::SETCC, MVT::v8i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v4i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v2i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + setOperationAction(ISD::SETCC, MVT::v1i64, Custom); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v1f32, Custom); + setOperationAction(ISD::SETCC, MVT::v2f32, Custom); + setOperationAction(ISD::SETCC, MVT::v4f32, Custom); + setOperationAction(ISD::SETCC, MVT::v1f64, Custom); + setOperationAction(ISD::SETCC, MVT::v2f64, Custom); + + setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal); + 
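The long run of setOperationAction calls here records, per (opcode, value type), how type legalization should treat a node: leave it alone, promote it, expand it, or hand it to the target's custom lowering. A toy model of that table, under the assumption (true for this code base) that unlisted combinations default to Legal; the type and member names are illustrative only:

    #include <map>
    #include <utility>

    enum class Action { Legal, Promote, Expand, Custom };

    struct ActionTable {
      // Keyed by (opcode, simple value type); anything not present is Legal.
      std::map<std::pair<unsigned, unsigned>, Action> Table;
      void set(unsigned Op, unsigned VT, Action A) { Table[{Op, VT}] = A; }
      Action get(unsigned Op, unsigned VT) const {
        auto It = Table.find({Op, VT});
        return It == Table.end() ? Action::Legal : It->second;
      }
    };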
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + + setOperationAction(ISD::FCEIL, MVT::v2f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v1f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); + + setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); + + setOperationAction(ISD::FRINT, MVT::v2f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v1f64, Legal); + setOperationAction(ISD::FRINT, MVT::v2f64, Legal); + + setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + + setOperationAction(ISD::FROUND, MVT::v2f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v1f64, Legal); + setOperationAction(ISD::FROUND, MVT::v2f64, Legal); + } } -EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { // It's reasonably important that this value matches the "natural" legal // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). @@ -271,16 +382,16 @@ EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const { static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, unsigned &LdrOpc, unsigned &StrOpc) { - static unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, - AArch64::LDXR_word, AArch64::LDXR_dword}; - static unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, - AArch64::LDAXR_word, AArch64::LDAXR_dword}; - static unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, - AArch64::STXR_word, AArch64::STXR_dword}; - static unsigned StoreRels[] = {AArch64::STLXR_byte, AArch64::STLXR_hword, - AArch64::STLXR_word, AArch64::STLXR_dword}; - - unsigned *LoadOps, *StoreOps; + static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, + AArch64::LDXR_word, AArch64::LDXR_dword}; + static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, + AArch64::LDAXR_word, AArch64::LDAXR_dword}; + static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, + AArch64::STXR_word, AArch64::STXR_dword}; + static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword, + AArch64::STLXR_word, AArch64::STLXR_dword}; + + const unsigned *LoadOps, *StoreOps; if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) LoadOps = LoadAcqs; else @@ -298,6 +409,29 @@ static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, StrOpc = StoreOps[Log2_32(Size)]; } +// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really +// have value type mapped, and they are both being defined as MVT::untyped. +// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost +// would fail to figure out the register pressure correctly. 
+std::pair<const TargetRegisterClass*, uint8_t> +AArch64TargetLowering::findRepresentativeClass(MVT VT) const{ + const TargetRegisterClass *RRC = 0; + uint8_t Cost = 1; + switch (VT.SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(VT); + case MVT::v4i64: + RRC = &AArch64::QPairRegClass; + Cost = 2; + break; + case MVT::v8i64: + RRC = &AArch64::QQuadRegClass; + Cost = 4; + break; + } + return std::make_pair(RRC, Cost); +} + MachineBasicBlock * AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, @@ -623,6 +757,12 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); + if (!NZCVKilled) { + // NZCV is live-through TrueBB. + TrueBB->addLiveIn(AArch64::NZCV); + EndBB->addLiveIn(AArch64::NZCV); + } + // IfTrue: // str qIFTRUE, [sp] BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR)) @@ -637,8 +777,6 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, // Done: // ldr qDEST, [sp] // [... rest of incoming MBB ...] - if (!NZCVKilled) - EndBB->addLiveIn(AArch64::NZCV); MachineInstr *StartOfEnd = EndBB->begin(); BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg) .addFrameIndex(ScratchFI) @@ -784,7 +922,102 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; - default: return NULL; + case AArch64ISD::NEON_BSL: + return "AArch64ISD::NEON_BSL"; + case AArch64ISD::NEON_MOVIMM: + return "AArch64ISD::NEON_MOVIMM"; + case AArch64ISD::NEON_MVNIMM: + return "AArch64ISD::NEON_MVNIMM"; + case AArch64ISD::NEON_FMOVIMM: + return "AArch64ISD::NEON_FMOVIMM"; + case AArch64ISD::NEON_CMP: + return "AArch64ISD::NEON_CMP"; + case AArch64ISD::NEON_CMPZ: + return "AArch64ISD::NEON_CMPZ"; + case AArch64ISD::NEON_TST: + return "AArch64ISD::NEON_TST"; + case AArch64ISD::NEON_QSHLs: + return "AArch64ISD::NEON_QSHLs"; + case AArch64ISD::NEON_QSHLu: + return "AArch64ISD::NEON_QSHLu"; + case AArch64ISD::NEON_VDUP: + return "AArch64ISD::NEON_VDUP"; + case AArch64ISD::NEON_VDUPLANE: + return "AArch64ISD::NEON_VDUPLANE"; + case AArch64ISD::NEON_REV16: + return "AArch64ISD::NEON_REV16"; + case AArch64ISD::NEON_REV32: + return "AArch64ISD::NEON_REV32"; + case AArch64ISD::NEON_REV64: + return "AArch64ISD::NEON_REV64"; + case AArch64ISD::NEON_UZP1: + return "AArch64ISD::NEON_UZP1"; + case AArch64ISD::NEON_UZP2: + return "AArch64ISD::NEON_UZP2"; + case AArch64ISD::NEON_ZIP1: + return "AArch64ISD::NEON_ZIP1"; + case AArch64ISD::NEON_ZIP2: + return "AArch64ISD::NEON_ZIP2"; + case AArch64ISD::NEON_TRN1: + return "AArch64ISD::NEON_TRN1"; + case AArch64ISD::NEON_TRN2: + return "AArch64ISD::NEON_TRN2"; + case AArch64ISD::NEON_LD1_UPD: + return "AArch64ISD::NEON_LD1_UPD"; + case AArch64ISD::NEON_LD2_UPD: + return "AArch64ISD::NEON_LD2_UPD"; + case AArch64ISD::NEON_LD3_UPD: + return "AArch64ISD::NEON_LD3_UPD"; + case AArch64ISD::NEON_LD4_UPD: + return "AArch64ISD::NEON_LD4_UPD"; + case AArch64ISD::NEON_ST1_UPD: + return "AArch64ISD::NEON_ST1_UPD"; + case AArch64ISD::NEON_ST2_UPD: + return "AArch64ISD::NEON_ST2_UPD"; + case AArch64ISD::NEON_ST3_UPD: + return "AArch64ISD::NEON_ST3_UPD"; + case AArch64ISD::NEON_ST4_UPD: + return "AArch64ISD::NEON_ST4_UPD"; + case AArch64ISD::NEON_LD1x2_UPD: + return "AArch64ISD::NEON_LD1x2_UPD"; + case AArch64ISD::NEON_LD1x3_UPD: + return "AArch64ISD::NEON_LD1x3_UPD"; + case AArch64ISD::NEON_LD1x4_UPD: + 
return "AArch64ISD::NEON_LD1x4_UPD"; + case AArch64ISD::NEON_ST1x2_UPD: + return "AArch64ISD::NEON_ST1x2_UPD"; + case AArch64ISD::NEON_ST1x3_UPD: + return "AArch64ISD::NEON_ST1x3_UPD"; + case AArch64ISD::NEON_ST1x4_UPD: + return "AArch64ISD::NEON_ST1x4_UPD"; + case AArch64ISD::NEON_LD2DUP: + return "AArch64ISD::NEON_LD2DUP"; + case AArch64ISD::NEON_LD3DUP: + return "AArch64ISD::NEON_LD3DUP"; + case AArch64ISD::NEON_LD4DUP: + return "AArch64ISD::NEON_LD4DUP"; + case AArch64ISD::NEON_LD2DUP_UPD: + return "AArch64ISD::NEON_LD2DUP_UPD"; + case AArch64ISD::NEON_LD3DUP_UPD: + return "AArch64ISD::NEON_LD3DUP_UPD"; + case AArch64ISD::NEON_LD4DUP_UPD: + return "AArch64ISD::NEON_LD4DUP_UPD"; + case AArch64ISD::NEON_LD2LN_UPD: + return "AArch64ISD::NEON_LD2LN_UPD"; + case AArch64ISD::NEON_LD3LN_UPD: + return "AArch64ISD::NEON_LD3LN_UPD"; + case AArch64ISD::NEON_LD4LN_UPD: + return "AArch64ISD::NEON_LD4LN_UPD"; + case AArch64ISD::NEON_ST2LN_UPD: + return "AArch64ISD::NEON_ST2LN_UPD"; + case AArch64ISD::NEON_ST3LN_UPD: + return "AArch64ISD::NEON_ST3LN_UPD"; + case AArch64ISD::NEON_ST4LN_UPD: + return "AArch64ISD::NEON_ST4LN_UPD"; + case AArch64ISD::NEON_VEXTRACT: + return "AArch64ISD::NEON_VEXTRACT"; + default: + return NULL; } } @@ -826,7 +1059,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { void AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc DL, SDValue &Chain) const { + SDLoc DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64MachineFunctionInfo *FuncInfo @@ -858,24 +1091,31 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, } } + if (getSubtarget()->hasFPARMv8()) { unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], - &AArch64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); + // According to the AArch64 Procedure Call Standard, section B.1/B.3, we + // can omit a register save area if we know we'll never use registers of + // that class. 
+ if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], + &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } } + FuncInfo->setVariadicFPRIdx(FPRIdx); + FuncInfo->setVariadicFPRSize(FPRSaveSize); } int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true); @@ -883,8 +1123,6 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, FuncInfo->setVariadicStackIdx(StackIdx); FuncInfo->setVariadicGPRIdx(GPRIdx); FuncInfo->setVariadicGPRSize(GPRSaveSize); - FuncInfo->setVariadicFPRIdx(FPRIdx); - FuncInfo->setVariadicFPRSize(FPRSaveSize); if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], @@ -897,7 +1135,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64MachineFunctionInfo *FuncInfo @@ -1012,7 +1250,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector<CCValAssign, 16> RVLocs; @@ -1085,10 +1323,10 @@ SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; - SmallVector<SDValue, 32> &OutVals = CLI.OutVals; - SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; @@ -1151,7 +1389,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP, getPointerTy()); @@ -1282,7 +1521,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // in the correct location. 
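The save-area arithmetic above follows the AAPCS64 variadic convention: the eight integer argument registers (x0..x7) are spilled 8 bytes each, and the eight FP/SIMD argument registers (q0..q7) 16 bytes each, but only for the registers not already consumed by named arguments. A standalone sketch of that sizing; the GPR half is written by analogy with the FPR code shown in this hunk, and the function itself is illustrative, not the patch's API:

    struct VarArgSaveSizes {
      unsigned GPRSaveSize; // bytes needed for x0..x7 not used by named args
      unsigned FPRSaveSize; // bytes needed for q0..q7 not used by named args
    };

    static VarArgSaveSizes computeSaveSizes(unsigned FirstVariadicGPR,
                                            unsigned FirstVariadicFPR,
                                            bool HasFPARMv8) {
      const unsigned NumGPRArgRegs = 8, NumFPRArgRegs = 8;
      VarArgSaveSizes S;
      S.GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
      // As in the patch, the FPR area is omitted entirely when the FP/SIMD
      // register file is unavailable.
      S.FPRSaveSize = HasFPARMv8 ? 16 * (NumFPRArgRegs - FirstVariadicFPR) : 0;
      return S;
    }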
if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); InFlag = Chain.getValue(1); } @@ -1336,7 +1575,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag); + InFlag, dl); InFlag = Chain.getValue(1); } @@ -1348,7 +1587,7 @@ SDValue AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; @@ -1537,7 +1776,7 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, } // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other, + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, &ArgChains[0], ArgChains.size()); } @@ -1570,7 +1809,7 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &A64cc, - SelectionDAG &DAG, DebugLoc &dl) const { + SelectionDAG &DAG, SDLoc &dl) const { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { int64_t C = 0; EVT VT = RHSC->getValueType(0); @@ -1663,7 +1902,7 @@ static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); EVT PtrVT = getPointerTy(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); @@ -1693,7 +1932,7 @@ AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // (BRCOND chain, val, dest) SDValue AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Chain = Op.getOperand(0); SDValue TheBit = Op.getOperand(1); SDValue DestBB = Op.getOperand(2); @@ -1716,7 +1955,7 @@ AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // (BR_CC chain, condcode, lhs, rhs, dest) SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); @@ -1802,7 +2041,7 @@ AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, 0, getLibcallCallingConv(Call), isTailCall, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, Op->getDebugLoc()); + Callee, Args, DAG, SDLoc(Op)); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); if (!CallInfo.second.getNode()) @@ -1824,7 +2063,7 @@ AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, Op.getDebugLoc()); + /*isSigned*/ false, SDLoc(Op)).first; } SDValue @@ -1854,6 +2093,45 @@ AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, return LowerF128ToCall(Op, DAG, LC); } +SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 
SelectionDAG &DAG) const{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(8, MVT::i64); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Return X30, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64)); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64); +} + + +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) + const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned FrameReg = AArch64::X29; + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, false, 0); + return FrameAddr; +} + SDValue AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const { @@ -1861,7 +2139,7 @@ AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, assert(getTargetMachine().getRelocationModel() == Reloc::Static); EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); @@ -1885,7 +2163,7 @@ AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, assert(getTargetMachine().getCodeModel() == CodeModel::Small); EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); unsigned Alignment = GV->getAlignment(); @@ -1927,7 +2205,7 @@ AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, } unsigned char HiFixup, LoFixup; - bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM); + bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); if (UseGOT) { HiFixup = AArch64II::MO_GOT; @@ -1978,7 +2256,7 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, - DebugLoc DL, + SDLoc DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); @@ -2023,7 +2301,7 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && + assert(getSubtarget()->isTargetELF() && "TLS not implemented for non-ELF targets"); assert(getTargetMachine().getCodeModel() == CodeModel::Small && "TLS only supported in small memory model"); @@ -2033,7 +2311,7 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue TPOff; EVT PtrVT = getPointerTy(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); @@ -2054,7 +2332,7 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, AArch64II::MO_TPREL_G0_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, 
PtrVT, HiVar, - DAG.getTargetConstant(0, MVT::i32)), 0); + DAG.getTargetConstant(1, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, TPOff, LoVar, DAG.getTargetConstant(0, MVT::i32)), 0); @@ -2134,7 +2412,7 @@ AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - DebugLoc dl = JT->getDebugLoc(); + SDLoc dl(JT); EVT PtrVT = getPointerTy(); // When compiling PIC, jump tables get put in the code section so a static @@ -2161,7 +2439,7 @@ AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue IfTrue = Op.getOperand(2); @@ -2217,7 +2495,7 @@ AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // (SELECT testbit, iftrue, iffalse) SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue TheBit = Op.getOperand(0); SDValue IfTrue = Op.getOperand(1); SDValue IfFalse = Op.getOperand(2); @@ -2236,15 +2514,225 @@ AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(A64CC::NE, MVT::i32)); } +static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + EVT VT = Op.getValueType(); + bool Invert = false; + SDValue Op0, Op1; + unsigned Opcode; + + if (LHS.getValueType().isInteger()) { + + // Attempt to use Vector Integer Compare Mask Test instruction. + // TST = icmp ne (and (op0, op1), zero). + if (CC == ISD::SETNE) { + if (((LHS.getOpcode() == ISD::AND) && + ISD::isBuildVectorAllZeros(RHS.getNode())) || + ((RHS.getOpcode() == ISD::AND) && + ISD::isBuildVectorAllZeros(LHS.getNode()))) { + + SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; + SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); + SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); + return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); + } + } + + // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). + // Note: Compare against Zero does not support unsigned predicates. + if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || + ISD::isBuildVectorAllZeros(LHS.getNode())) && + !isUnsignedIntSetCC(CC)) { + + // If LHS is the zero value, swap operands and CondCode. + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + CC = getSetCCSwappedOperands(CC); + Op0 = RHS; + } else + Op0 = LHS; + + // Ensure valid CondCode for Compare Mask against Zero instruction: + // EQ, GE, GT, LE, LT. + if (ISD::SETNE == CC) { + Invert = true; + CC = ISD::SETEQ; + } + + // Using constant type to differentiate integer and FP compares with zero. + Op1 = DAG.getConstant(0, MVT::i32); + Opcode = AArch64ISD::NEON_CMPZ; + + } else { + // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). + // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. 
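The condition-code handling that follows canonicalizes integer vector compares onto the forms the NEON compare instructions actually provide (EQ, GE, GT and their unsigned variants): NE becomes EQ followed by a bitwise invert of the mask, and the "less than" forms are obtained by swapping the operands. A scalar model of that mapping, with hypothetical enum and struct names:

    enum class Cond { EQ, NE, GT, GE, LT, LE, UGT, UGE, ULT, ULE };

    struct Canonical {
      Cond CC;
      bool SwapOperands;  // emit cmp(rhs, lhs) instead of cmp(lhs, rhs)
      bool InvertResult;  // bitwise-NOT the resulting compare mask
    };

    static Canonical canonicalizeIntVectorCC(Cond CC) {
      switch (CC) {
      case Cond::NE:  return {Cond::EQ,  false, true};
      case Cond::LT:  return {Cond::GT,  true,  false};
      case Cond::LE:  return {Cond::GE,  true,  false};
      case Cond::ULT: return {Cond::UGT, true,  false};
      case Cond::ULE: return {Cond::UGE, true,  false};
      default:        return {CC,        false, false}; // EQ, GT, GE, UGT, UGE
      }
    }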
+ bool Swap = false; + switch (CC) { + default: + llvm_unreachable("Illegal integer comparison."); + case ISD::SETEQ: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + break; + case ISD::SETNE: + Invert = true; + CC = ISD::SETEQ; + break; + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + Swap = true; + CC = getSetCCSwappedOperands(CC); + } + + if (Swap) + std::swap(LHS, RHS); + + Opcode = AArch64ISD::NEON_CMP; + Op0 = LHS; + Op1 = RHS; + } + + // Generate Compare Mask instr or Compare Mask against Zero instr. + SDValue NeonCmp = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + + if (Invert) + NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + + return NeonCmp; + } + + // Now handle Floating Point cases. + // Attempt to use Vector Floating Point Compare Mask against Zero instruction. + if (ISD::isBuildVectorAllZeros(RHS.getNode()) || + ISD::isBuildVectorAllZeros(LHS.getNode())) { + + // If LHS is the zero value, swap operands and CondCode. + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + CC = getSetCCSwappedOperands(CC); + Op0 = RHS; + } else + Op0 = LHS; + + // Using constant type to differentiate integer and FP compares with zero. + Op1 = DAG.getConstantFP(0, MVT::f32); + Opcode = AArch64ISD::NEON_CMPZ; + } else { + // Attempt to use Vector Floating Point Compare Mask instruction. + Op0 = LHS; + Op1 = RHS; + Opcode = AArch64ISD::NEON_CMP; + } + + SDValue NeonCmpAlt; + // Some register compares have to be implemented with swapped CC and operands, + // e.g.: OLT implemented as OGT with swapped operands. + bool SwapIfRegArgs = false; + + // Ensure valid CondCode for FP Compare Mask against Zero instruction: + // EQ, GE, GT, LE, LT. + // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. + switch (CC) { + default: + llvm_unreachable("Illegal FP comparison"); + case ISD::SETUNE: + case ISD::SETNE: + Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: + CC = ISD::SETEQ; + break; + case ISD::SETOLT: + case ISD::SETLT: + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETOGT: + case ISD::SETGT: + CC = ISD::SETGT; + break; + case ISD::SETOLE: + case ISD::SETLE: + CC = ISD::SETLE; + SwapIfRegArgs = true; + break; + case ISD::SETOGE: + case ISD::SETGE: + CC = ISD::SETGE; + break; + case ISD::SETUGE: + Invert = true; + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETULE: + Invert = true; + CC = ISD::SETGT; + break; + case ISD::SETUGT: + Invert = true; + CC = ISD::SETLE; + SwapIfRegArgs = true; + break; + case ISD::SETULT: + Invert = true; + CC = ISD::SETGE; + break; + case ISD::SETUEQ: + Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OGT |OLT). + NeonCmpAlt = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OGE | OLT). 
+ NeonCmpAlt = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE)); + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + } + + if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) { + CC = getSetCCSwappedOperands(CC); + std::swap(Op0, Op1); + } + + // Generate FP Compare Mask instr or FP Compare Mask against Zero instr + SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + + if (NeonCmpAlt.getNode()) + NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt); + + if (Invert) + NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + + return NeonCmp; +} + // (SETCC lhs, rhs, condcode) SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorSETCC(Op, DAG); + if (LHS.getValueType() == MVT::f128) { // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS // for the rest of the function (some i32 or i64 values). @@ -2298,7 +2786,7 @@ AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes // rather than just 8. - return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(), + return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), Op.getOperand(2), DAG.getConstant(32, MVT::i32), 8, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); @@ -2311,7 +2799,7 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64MachineFunctionInfo *FuncInfo = MF.getInfo<AArch64MachineFunctionInfo>(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); @@ -2389,6 +2877,8 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); @@ -2401,16 +2891,161 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); } return SDValue(); } +/// Check if the specified splat value corresponds to a valid vector constant +/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If +/// so, return the encoded 8-bit immediate and the OpCmode instruction fields +/// values. +static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + bool is128Bits, NeonModImmType type, EVT &VT, + unsigned &Imm, unsigned &OpCmode) { + switch (SplatBitSize) { + default: + llvm_unreachable("unexpected size for isNeonModifiedImm"); + case 8: { + if (type != Neon_Mov_Imm) + return false; + assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); + // Neon movi per byte: Op=0, Cmode=1110. 
+ OpCmode = 0xe; + Imm = SplatBits; + VT = is128Bits ? MVT::v16i8 : MVT::v8i8; + break; + } + case 16: { + // Neon move inst per halfword + VT = is128Bits ? MVT::v8i16 : MVT::v4i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn is 0x00nn LSL 0 + // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 + // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 + // Op=x, Cmode=100y + Imm = SplatBits; + OpCmode = 0x8; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00 is 0x00nn LSL 8 + // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 + // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 + // Op=x, Cmode=101x + Imm = SplatBits >> 8; + OpCmode = 0xa; + break; + } + // can't handle any other + return false; + } + + case 32: { + // First the LSL variants (MSL is unusable by some interested instructions). + + // Neon move instr per word, shift zeros + VT = is128Bits ? MVT::v4i32 : MVT::v2i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn is 0x000000nn LSL 0 + // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 + // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 + // Op=x, Cmode=000x + Imm = SplatBits; + OpCmode = 0; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00 is 0x000000nn LSL 8 + // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 + // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 + // Op=x, Cmode=001x + Imm = SplatBits >> 8; + OpCmode = 0x2; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000 is 0x000000nn LSL 16 + // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 + // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 + // Op=x, Cmode=010x + Imm = SplatBits >> 16; + OpCmode = 0x4; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000 is 0x000000nn LSL 24 + // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 + // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 + // Op=x, Cmode=011x + Imm = SplatBits >> 24; + OpCmode = 0x6; + break; + } + + // Now the MSL immediates. + + // Neon move instr per word, shift ones + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff is 0x000000nn MSL 8 + // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 + // Op=x, Cmode=1100 + Imm = SplatBits >> 8; + OpCmode = 0xc; + break; + } + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff is 0x000000nn MSL 16 + // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 + // Op=x, Cmode=1101 + Imm = SplatBits >> 16; + OpCmode = 0xd; + break; + } + // can't handle any other + return false; + } + + case 64: { + if (type != Neon_Mov_Imm) + return false; + // Neon move instr bytemask, where each byte is either 0x00 or 0xff. + // movi Op=1, Cmode=1110. + OpCmode = 0x1e; + uint64_t BitMask = 0xff; + uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { + return false; + } + BitMask <<= 8; + ImmMask <<= 1; + } + SplatBits = Val; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; + break; + } + } + + return true; +} + static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); // We're looking for an SRA/SHL pair which form an SBFX. 
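The 32-bit "LSL" cases of isNeonModifiedImm above accept exactly those splats that are a single byte shifted left by 0, 8, 16 or 24 bits; the cmode field then encodes which shift was used. A minimal standalone checker for that condition (cmode selection itself is as in the patch; the function name is illustrative):

    #include <cstdint>

    // Returns true if Splat == Imm8 << Shift for some Shift in {0, 8, 16, 24};
    // Imm8 and Shift are outputs.
    static bool isByteShiftedSplat32(uint32_t Splat, uint8_t &Imm8,
                                     unsigned &Shift) {
      for (Shift = 0; Shift < 32; Shift += 8) {
        if ((Splat & ~(0xffu << Shift)) == 0) {
          Imm8 = uint8_t(Splat >> Shift);
          return true;
        }
      }
      return false;
    }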
@@ -2448,7 +3083,7 @@ static SDValue PerformANDCombine(SDNode *N, /// a compatible SHL operation (unless they're already low). This function /// checks that condition and returns the least-significant bit that's /// intended. If the operation not a field preparation, -1 is returned. -static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT, +static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue &MaskedVal, uint64_t Mask) { if (!isShiftedMask_64(Mask)) return -1; @@ -2464,7 +3099,7 @@ static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT, // cases (e.g. bitfield to bitfield copy) may still need a real shift before // the BFI. - uint64_t LSB = CountTrailingZeros_64(Mask); + uint64_t LSB = countTrailingZeros(Mask); int64_t ShiftRightRequired = LSB; if (MaskedVal.getOpcode() == ISD::SHL && isa<ConstantSDNode>(MaskedVal.getOperand(1))) { @@ -2524,7 +3159,7 @@ static SDValue tryCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); @@ -2605,7 +3240,7 @@ static SDValue tryCombineToLargerBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); // First job is to hunt for a MaskedBFI on either the left or right. Swap @@ -2687,7 +3322,7 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); @@ -2731,6 +3366,7 @@ static SDValue PerformORCombine(SDNode *N, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); EVT VT = N->getValueType(0); if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -2751,6 +3387,44 @@ static SDValue PerformORCombine(SDNode *N, if (Res.getNode()) return Res; + if (!Subtarget->hasNEON()) + return SDValue(); + + // Attempt to use vector immediate-form BSL + // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + APInt SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); + APInt SplatBits0; + if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, + HasAnyUndefs) && + !HasAnyUndefs) { + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); + APInt SplatBits1; + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && + !HasAnyUndefs && SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection simpler. + EVT CanonicalVT = VT.is128BitVector() ? 
MVT::v16i8 : MVT::v8i8; + SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT, + N0->getOperand(1), N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, Result); + } + } + } + return SDValue(); } @@ -2759,7 +3433,7 @@ static SDValue PerformSRACombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); // We're looking for an SRA/SHL pair which form an SBFX. @@ -2791,6 +3465,336 @@ static SDValue PerformSRACombine(SDNode *N, DAG.getConstant(LSB + Width - 1, MVT::i64)); } +/// Check if this is a valid build_vector for the immediate operand of +/// a vector shift operation, where all the elements of the build_vector +/// must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// Check if this is a valid build_vector for the immediate operand of +/// a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits +static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && Cnt < ElementBits); +} + +/// Check if this is a valid build_vector for the immediate operand of a +/// vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits +static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 1 && Cnt <= ElementBits); +} + +/// Checks for immediate versions of vector shifts and lowers them. +static SDValue PerformShiftCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *ST) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64)) + return PerformSRACombine(N, DCI); + + // Nothing to be done for scalar shifts. 
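The isVShiftLImm/isVShiftRImm checks above encode the asymmetry of the immediate-form vector shifts: a left shift takes amounts 0 to ElementBits-1, while the right shifts take 1 to ElementBits. A compact standalone restatement of that range check (illustrative helper name):

    #include <cstdint>

    static bool isValidVectorShiftImm(bool IsLeftShift, unsigned ElementBits,
                                      int64_t Cnt) {
      if (IsLeftShift)
        return Cnt >= 0 && Cnt < (int64_t)ElementBits;
      return Cnt >= 1 && Cnt <= (int64_t)ElementBits;
    }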
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isVector() || !TLI.isTypeLegal(VT)) + return SDValue(); + + assert(ST->hasNEON() && "unexpected vector shift"); + int64_t Cnt; + + switch (N->getOpcode()) { + default: + llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(N->getOperand(1), VT, Cnt)) { + SDValue RHS = + DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS); + } + break; + + case ISD::SRA: + case ISD::SRL: + if (isVShiftRImm(N->getOperand(1), VT, Cnt)) { + SDValue RHS = + DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS); + } + break; + } + + return SDValue(); +} + +/// ARM-specific DAG combining for intrinsics. +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + + switch (IntNo) { + default: + // Don't do anything for most intrinsics. + break; + + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + if (!isVShiftLImm(N->getOperand(2), VT, Cnt)) + break; + unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts) + ? AArch64ISD::NEON_QSHLs + : AArch64ISD::NEON_QSHLu; + return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + } + + return SDValue(); +} + +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. +static SDValue CombineBaseUpdate(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. 
+ bool isLoad = true; + bool isLaneOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; + NumVecs = 1; isLoad = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD; + NumVecs = 2; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD; + NumVecs = 3; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD; + NumVecs = 4; isLoad = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode for Neon base update"); + case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD; + NumVecs = 2; break; + case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD; + NumVecs = 3; break; + case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD; + NumVecs = 4; break; + } + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoad) + VecTy = N->getValueType(0); + else + VecTy = N->getOperand(AddrOpIdx + 1).getValueType(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getTargetConstant(IncVal, MVT::i32); + } + + // Create the new updating load/store node. 
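The size check above only folds the ADD into a post-incremented load/store when its constant increment matches the number of bytes the instruction actually transfers: NumVecs full registers for the whole-vector forms, or one element per register for the lane forms. A standalone sketch of that arithmetic (illustrative helper name):

    static unsigned expectedIncrement(unsigned NumVecs, unsigned VecSizeInBits,
                                      unsigned NumElements, bool IsLaneOp) {
      unsigned NumBytes = NumVecs * VecSizeInBits / 8;
      if (IsLaneOp)                // a lane op touches one element per register
        NumBytes /= NumElements;
      return NumBytes;
    }
    // Example: vld3 of v4i32 registers transfers 3 * 128 / 8 = 48 bytes,
    // while vld3lane on the same type transfers 48 / 4 = 12 bytes.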
+ EVT Tys[6]; + unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2); + SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(AddrOpIdx)); + Ops.push_back(Inc); + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, + Ops.data(), Ops.size(), + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector<SDValue> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) +/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs. +/// If so, combine them to a vldN-dup operation and return true. +static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. + SDNode *VLD = N->getOperand(0).getNode(); + if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return SDValue(); + unsigned NumVecs = 0; + unsigned NewOpc = 0; + unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_neon_vld2lane) { + NumVecs = 2; + NewOpc = AArch64ISD::NEON_LD2DUP; + } else if (IntNo == Intrinsic::arm_neon_vld3lane) { + NumVecs = 3; + NewOpc = AArch64ISD::NEON_LD3DUP; + } else if (IntNo == Intrinsic::arm_neon_vld4lane) { + NumVecs = 4; + NewOpc = AArch64ISD::NEON_LD4DUP; + } else { + return SDValue(); + } + + // First check that all the vldN-lane uses are VDUPLANEs and that the lane + // numbers match the load. + unsigned VLDLaneNo = + cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue(); + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + // Ignore uses of the chain result. + if (UI.getUse().getResNo() == NumVecs) + continue; + SDNode *User = *UI; + if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE || + VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) + return SDValue(); + } + + // Create the vldN-dup node. + EVT Tys[5]; + unsigned n; + for (n = 0; n < NumVecs; ++n) + Tys[n] = VT; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1); + SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; + MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2, + VLDMemInt->getMemoryVT(), + VLDMemInt->getMemOperand()); + + // Update the uses. + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + unsigned ResNo = UI.getUse().getResNo(); + // Ignore uses of the chain result. + if (ResNo == NumVecs) + continue; + SDNode *User = *UI; + DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + } + + // Now the vldN-lane intrinsic is dead except for its chain result. + // Update uses of the chain. 
+ std::vector<SDValue> VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return SDValue(N, 0); +} SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, @@ -2798,12 +3802,578 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::AND: return PerformANDCombine(N, DCI); - case ISD::OR: return PerformORCombine(N, DCI, Subtarget); - case ISD::SRA: return PerformSRACombine(N, DCI); + case ISD::OR: return PerformORCombine(N, DCI, getSubtarget()); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + return PerformShiftCombine(N, DCI, getSubtarget()); + case ISD::INTRINSIC_WO_CHAIN: + return PerformIntrinsicCombine(N, DCI.DAG); + case AArch64ISD::NEON_VDUPLANE: + return CombineVLDDUP(N, DCI); + case AArch64ISD::NEON_LD2DUP: + case AArch64ISD::NEON_LD3DUP: + case AArch64ISD::NEON_LD4DUP: + return CombineBaseUpdate(N, DCI); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::aarch64_neon_vld1x2: + case Intrinsic::aarch64_neon_vld1x3: + case Intrinsic::aarch64_neon_vld1x4: + case Intrinsic::aarch64_neon_vst1x2: + case Intrinsic::aarch64_neon_vst1x3: + case Intrinsic::aarch64_neon_vst1x4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: + return CombineBaseUpdate(N, DCI); + default: + break; + } } return SDValue(); } +bool +AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f16: + case MVT::f32: + case MVT::f64: + return true; + case MVT::f128: + return false; + default: + break; + } + + return false; +} + +// Check whether a Build Vector could be presented as Shuffle Vector. If yes, +// try to call LowerVECTOR_SHUFFLE to lower it. +bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, + SDValue &Res) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned V0NumElts = 0; + int Mask[16]; + SDValue V0, V1; + + // Check if all elements are extracted from less than 3 vectors. 
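Since isFMAFasterThanFMulAndFAdd above returns true for f16/f32/f64 and false for f128, llvm.fmuladd calls are expanded to the fused FMADD/FMSUB family for the scalar FP types but not for quad precision. A short illustration (assuming the front end emits llvm.fmuladd for the contracted expression, e.g. under -ffp-contract=on):

    // a*b + c contracts to llvm.fmuladd.f64; the hook returns true for f64,
    // so it is selected as a single FMADD Dd, Dn, Dm, Da.
    double mac(double a, double b, double c) { return a * b + c; }

    // long double is f128 on AArch64; the hook returns false, so the fmuladd
    // is expanded back into a separate multiply and add.
    long double macq(long double a, long double b, long double c) { return a * b + c; }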
+ for (unsigned i = 0; i < NumElts; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + if (V0.getNode() == 0) { + V0 = Elt.getOperand(0); + V0NumElts = V0.getValueType().getVectorNumElements(); + } + if (Elt.getOperand(0) == V0) { + Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue()); + continue; + } else if (V1.getNode() == 0) { + V1 = Elt.getOperand(0); + } + if (Elt.getOperand(0) == V1) { + unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue(); + Mask[i] = (Lane + V0NumElts); + continue; + } else { + return false; + } + } + + if (!V1.getNode() && V0NumElts == NumElts * 2) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, + DAG.getConstant(NumElts, MVT::i64)); + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, + DAG.getConstant(0, MVT::i64)); + V0NumElts = V0.getValueType().getVectorNumElements(); + } + + if (V1.getNode() && NumElts == V0NumElts && + V0NumElts == V1.getValueType().getVectorNumElements()) { + SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask); + Res = LowerVECTOR_SHUFFLE(Shuffle, DAG); + return true; + } else + return false; +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. +SDValue +AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) const { + + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + unsigned UseNeonMov = VT.getSizeInBits() >= 64; + + // Note we favor lowering MOVI over MVNI. + // This has implications on the definition of patterns in TableGen to select + // BIC immediate instructions but not ORR immediate instructions. + // If this lowering order is changed, TableGen patterns for BIC immediate and + // ORR immediate instructions have to be updated. + if (UseNeonMov && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + // First attempt to use vector immediate-form MOVI + EVT NeonMovVT; + unsigned Imm = 0; + unsigned OpCmode = 0; + + if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, VT.is128BitVector(), + Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) { + SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); + SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); + + if (ImmVal.getNode() && OpCmodeVal.getNode()) { + SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT, + ImmVal, OpCmodeVal); + return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + } + } + + // Then attempt to use vector immediate-form MVNI + uint64_t NegatedImm = (~SplatBits).getZExtValue(); + if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT, + Imm, OpCmode)) { + SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); + SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); + if (ImmVal.getNode() && OpCmodeVal.getNode()) { + SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT, + ImmVal, OpCmodeVal); + return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + } + } + + // Attempt to use vector immediate-form FMOV + if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) || + (VT == MVT::v2f64 && SplatBitSize == 64)) { + APFloat RealVal( + SplatBitSize == 32 ? 
APFloat::IEEEsingle : APFloat::IEEEdouble, + SplatBits); + uint32_t ImmVal; + if (A64Imms::isFPImm(RealVal, ImmVal)) { + SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); + return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val); + } + } + } + } + + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool hasDominantValue = false; + bool isConstant = true; + + // Map of the number of times a particular SDValue appears in the + // element list. + DenseMap<SDValue, unsigned> ValueCounts; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) + isConstant = false; + + ValueCounts.insert(std::make_pair(V, 0)); + unsigned &Count = ValueCounts[V]; + + // Is this value dominant? (takes up more than half of the lanes) + if (++Count > (NumElts / 2)) { + hasDominantValue = true; + Value = V; + } + } + if (ValueCounts.size() != 1) + usesOnlyOneValue = false; + if (!Value.getNode() && ValueCounts.size() > 0) + Value = ValueCounts.begin()->first; + + if (ValueCounts.size() == 0) + return DAG.getUNDEF(VT); + + // Loads are better lowered with insert_vector_elt. + // Keep going if we are hitting this case. + if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (hasDominantValue && EltSize <= 64) { + // Use VDUP for non-constant splats. + if (!isConstant) { + SDValue N; + + // If we are DUPing a value that comes directly from a vector, we could + // just use DUPLANE. We can only do this if the lane being extracted + // is at a constant index, as the DUP from lane instructions only have + // constant-index forms. + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(Value->getOperand(1))) { + N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, + Value->getOperand(0), Value->getOperand(1)); + } else + N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); + + if (!usesOnlyOneValue) { + // The dominant value was splatted as 'N', but we now have to insert + // all differing elements. + for (unsigned I = 0; I < NumElts; ++I) { + if (Op.getOperand(I) == Value) + continue; + SmallVector<SDValue, 3> Ops; + Ops.push_back(N); + Ops.push_back(Op.getOperand(I)); + Ops.push_back(DAG.getConstant(I, MVT::i64)); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3); + } + } + return N; + } + if (usesOnlyOneValue && isConstant) { + return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); + } + } + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Try to lower this in lowering ShuffleVector way. + SDValue Shuf; + if (isKnownShuffleVector(Op, DAG, Shuf)) + return Shuf; + + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. 
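To summarize the constant paths above: a splat of an encodable constant becomes a modified-immediate MOVI/MVNI (or FMOV for FP splats), while a non-constant value repeated in every lane takes the dominant-value path and becomes a single DUP from a general register. Two sketches in vector-extension syntax (illustrative):

    typedef int v4si __attribute__((vector_size(16)));

    // Constant splat: isNeonModifiedImm accepts #1, giving MOVI Vd.4s, #1.
    v4si ones(void)   { v4si r = {1, 1, 1, 1}; return r; }

    // Non-constant splat: hasDominantValue/usesOnlyOneValue hold, so the
    // lowering emits one DUP Vd.4s, Wn instead of four lane inserts.
    v4si splat(int x) { v4si r = {x, x, x, x}; return r; }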
+ if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx); + } + return Vec; + } + return SDValue(); +} + +/// isREVMask - Check if a vector shuffle corresponds to a REV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and +// TRN instruction. +static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) { + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts < 4) + return 0; + + bool ismatch = true; + + // Check UZP1 + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i * 2) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_UZP1; + + // Check UZP2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i * 2 + 1) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_UZP2; + + // Check ZIP1 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_ZIP1; + + // Check ZIP2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_ZIP2; + + // Check TRN1 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_TRN1; + + // Check TRN2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_TRN2; + + return 0; +} + +SDValue +AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. 
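For a 4-lane vector the masks accepted by isPermuteMask above are, concretely (indices 0-3 select from the first operand, 4-7 from the second; clang's __builtin_shufflevector is used only for illustration):

    typedef int v4si __attribute__((vector_size(16)));

    v4si uzp1(v4si a, v4si b) { return __builtin_shufflevector(a, b, 0, 2, 4, 6); } // M[i] = 2i
    v4si uzp2(v4si a, v4si b) { return __builtin_shufflevector(a, b, 1, 3, 5, 7); } // M[i] = 2i+1
    v4si zip1(v4si a, v4si b) { return __builtin_shufflevector(a, b, 0, 4, 1, 5); }
    v4si zip2(v4si a, v4si b) { return __builtin_shufflevector(a, b, 2, 6, 3, 7); }
    v4si trn1(v4si a, v4si b) { return __builtin_shufflevector(a, b, 0, 4, 2, 6); }
    v4si trn2(v4si a, v4si b) { return __builtin_shufflevector(a, b, 1, 5, 3, 7); }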
+ ArrayRef<int> ShuffleMask = SVN->getMask(); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize > 64) + return SDValue(); + + if (isREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1); + if (isREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1); + if (isREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1); + + unsigned ISDNo = isPermuteMask(ShuffleMask, VT); + if (ISDNo) + return DAG.getNode(ISDNo, dl, VT, V1, V2); + + // If the element of shuffle mask are all the same constant, we can + // transform it into either NEON_VDUP or NEON_VDUPLANE + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) Lane = 0; + + // Test if V1 is a SCALAR_TO_VECTOR. + if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0)); + } + // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR. + if (V1.getOpcode() == ISD::BUILD_VECTOR) { + bool IsScalarToVector = true; + for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i) + if (V1.getOperand(i).getOpcode() != ISD::UNDEF && + i != (unsigned)Lane) { + IsScalarToVector = false; + break; + } + if (IsScalarToVector) + return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, + V1.getOperand(Lane)); + } + + // Test if V1 is a EXTRACT_SUBVECTOR. + if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue(); + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0), + DAG.getConstant(Lane + ExtLane, MVT::i64)); + } + // Test if V1 is a CONCAT_VECTORS. + if (V1.getOpcode() == ISD::CONCAT_VECTORS && + V1.getOperand(1).getOpcode() == ISD::UNDEF) { + SDValue Op0 = V1.getOperand(0); + assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() && + "Invalid vector lane access"); + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0, + DAG.getConstant(Lane, MVT::i64)); + } + + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i64)); + } + + int Length = ShuffleMask.size(); + int V1EltNum = V1.getValueType().getVectorNumElements(); + + // If the number of v1 elements is the same as the number of shuffle mask + // element and the shuffle masks are sequential values, we can transform + // it into NEON_VEXTRACT. + if (V1EltNum == Length) { + // Check if the shuffle mask is sequential. + bool IsSequential = true; + int CurMask = ShuffleMask[0]; + for (int I = 0; I < Length; ++I) { + if (ShuffleMask[I] != CurMask) { + IsSequential = false; + break; + } + CurMask++; + } + if (IsSequential) { + assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect"); + unsigned VecSize = EltSize * V1EltNum; + unsigned Index = (EltSize/8) * ShuffleMask[0]; + if (VecSize == 64 || VecSize == 128) + return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2, + DAG.getConstant(Index, MVT::i64)); + } + } + + // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert + // by element from V2 to V1 . + // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a + // better choice to be inserted than V1 as less insert needed, so we count + // element to be inserted for both V1 and V2, and select less one as insert + // target. + + // Collect elements need to be inserted and their index. 
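Besides REV and the permutes, the shuffle lowering above turns splat masks into DUP/DUPLANE and strictly consecutive masks into EXT. Two sketches (illustrative):

    typedef unsigned char v16qi __attribute__((vector_size(16)));
    typedef int v4si __attribute__((vector_size(16)));

    // Mask 3..18 is consecutive across the concatenation of a and b, so the
    // sequential check fires and this becomes EXT Vd.16b, Va.16b, Vb.16b, #3.
    v16qi ext3(v16qi a, v16qi b) {
      return __builtin_shufflevector(a, b, 3, 4, 5, 6, 7, 8, 9, 10,
                                     11, 12, 13, 14, 15, 16, 17, 18);
    }

    // A splat of lane 2 satisfies isSplatMask and becomes DUP Vd.4s, Va.s[2]
    // via NEON_VDUPLANE.
    v4si dup_lane2(v4si a) { return __builtin_shufflevector(a, a, 2, 2, 2, 2); }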
+ SmallVector<int, 8> NV1Elt; + SmallVector<int, 8> N1Index; + SmallVector<int, 8> NV2Elt; + SmallVector<int, 8> N2Index; + for (int I = 0; I != Length; ++I) { + if (ShuffleMask[I] != I) { + NV1Elt.push_back(ShuffleMask[I]); + N1Index.push_back(I); + } + } + for (int I = 0; I != Length; ++I) { + if (ShuffleMask[I] != (I + V1EltNum)) { + NV2Elt.push_back(ShuffleMask[I]); + N2Index.push_back(I); + } + } + + // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2 + // will be inserted. + SDValue InsV = V1; + SmallVector<int, 8> InsMasks = NV1Elt; + SmallVector<int, 8> InsIndex = N1Index; + if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) { + if (NV1Elt.size() > NV2Elt.size()) { + InsV = V2; + InsMasks = NV2Elt; + InsIndex = N2Index; + } + } else { + InsV = DAG.getNode(ISD::UNDEF, dl, VT); + } + + for (int I = 0, E = InsMasks.size(); I != E; ++I) { + SDValue ExtV = V1; + int Mask = InsMasks[I]; + if (Mask >= V1EltNum) { + ExtV = V2; + Mask -= V1EltNum; + } + // Any value type smaller than i32 is illegal in AArch64, and this lower + // function is called after legalize pass, so we need to legalize + // the result here. + EVT EltVT; + if (VT.getVectorElementType().isFloatingPoint()) + EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32; + else + EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32; + + if (Mask >= 0) { + ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, + DAG.getConstant(Mask, MVT::i64)); + InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, + DAG.getConstant(InsIndex[I], MVT::i64)); + } + } + return InsV; +} + AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { @@ -2899,7 +4469,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'S': { // An absolute symbolic address or label reference. if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { - Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), + Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), GA->getValueType(0)); } else if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) { @@ -2935,7 +4505,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass*> AArch64TargetLowering::getRegForInlineAsmConstraint( const std::string &Constraint, - EVT VT) const { + MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -2949,14 +4519,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( return std::make_pair(0U, &AArch64::FPR16RegClass); else if (VT == MVT::f32) return std::make_pair(0U, &AArch64::FPR32RegClass); - else if (VT == MVT::f64) - return std::make_pair(0U, &AArch64::FPR64RegClass); else if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &AArch64::VPR64RegClass); - else if (VT == MVT::f128) - return std::make_pair(0U, &AArch64::FPR128RegClass); + return std::make_pair(0U, &AArch64::FPR64RegClass); else if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &AArch64::VPR128RegClass); + return std::make_pair(0U, &AArch64::FPR128RegClass); break; } } @@ -2965,3 +4531,69 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( // constraint into a member of a register class. return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } + +/// Represent NEON load and store intrinsics as MemIntrinsicNodes. +/// The associated MachineMemOperands record the alignment specified +/// in the intrinsic calls. 
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::aarch64_neon_vld1x2: + case Intrinsic::aarch64_neon_vld1x3: + case Intrinsic::aarch64_neon_vld1x4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::aarch64_neon_vst1x2: + case Intrinsic::aarch64_neon_vst1x3: + case Intrinsic::aarch64_neon_vst1x4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index d49b3ee..8ad5a79 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -19,7 +19,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" - +#include "llvm/IR/Intrinsics.h" namespace llvm { namespace AArch64ISD { @@ -111,7 +111,92 @@ namespace AArch64ISD { // created using the small memory model style: i.e. adrp/add or // adrp/mem-op. This exists to prevent bare TargetAddresses which may never // get selected. 
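In getTgtMemIntrinsic above, memVT is a conservative cover of the whole transfer: the allocation size of the loaded or stored aggregate, expressed as i64 lanes. A worked example (assuming the front end maps vld3q_f32 onto the vld3 intrinsic handled here):

    #include <arm_neon.h>

    // The result is three 16-byte vectors, 48 bytes in total, so
    // NumElts = 48 / 8 = 6 and Info.memVT becomes v6i64; Info.ptrVal is 'p' and
    // Info.align is taken from the intrinsic's trailing alignment argument.
    float32x4x3_t load3(const float *p) { return vld3q_f32(p); }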
- WrapperSmall + WrapperSmall, + + // Vector bitwise select + NEON_BSL, + + // Vector move immediate + NEON_MOVIMM, + + // Vector Move Inverted Immediate + NEON_MVNIMM, + + // Vector FP move immediate + NEON_FMOVIMM, + + // Vector permute + NEON_UZP1, + NEON_UZP2, + NEON_ZIP1, + NEON_ZIP2, + NEON_TRN1, + NEON_TRN2, + + // Vector Element reverse + NEON_REV64, + NEON_REV32, + NEON_REV16, + + // Vector compare + NEON_CMP, + + // Vector compare zero + NEON_CMPZ, + + // Vector compare bitwise test + NEON_TST, + + // Vector saturating shift + NEON_QSHLs, + NEON_QSHLu, + + // Vector dup + NEON_VDUP, + + // Vector dup by lane + NEON_VDUPLANE, + + // Vector extract + NEON_VEXTRACT, + + // NEON duplicate lane loads + NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + NEON_LD3DUP, + NEON_LD4DUP, + + // NEON loads with post-increment base updates: + NEON_LD1_UPD, + NEON_LD2_UPD, + NEON_LD3_UPD, + NEON_LD4_UPD, + NEON_LD1x2_UPD, + NEON_LD1x3_UPD, + NEON_LD1x4_UPD, + + // NEON stores with post-increment base updates: + NEON_ST1_UPD, + NEON_ST2_UPD, + NEON_ST3_UPD, + NEON_ST4_UPD, + NEON_ST1x2_UPD, + NEON_ST1x3_UPD, + NEON_ST1x4_UPD, + + // NEON duplicate lane loads with post-increment base updates: + NEON_LD2DUP_UPD, + NEON_LD3DUP_UPD, + NEON_LD4DUP_UPD, + + // NEON lane loads with post-increment base updates: + NEON_LD2LN_UPD, + NEON_LD3LN_UPD, + NEON_LD4LN_UPD, + + // NEON lane store with post-increment base updates: + NEON_ST2LN_UPD, + NEON_ST3LN_UPD, + NEON_ST4LN_UPD }; } @@ -130,14 +215,14 @@ public: SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; @@ -145,12 +230,18 @@ public: SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; - void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc DL, SDValue &Chain) const; + bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const; + + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) const; + + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. 
Targets which want to do tail call @@ -171,7 +262,7 @@ public: SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, int ClobberedFI) const; - EVT getSetCCResultType(EVT VT) const; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; @@ -181,7 +272,7 @@ public: bool isLegalICmpImmediate(int64_t Val) const; SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &A64cc, SelectionDAG &DAG, DebugLoc &dl) const; + SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const; virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; @@ -211,12 +302,14 @@ public: SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, DebugLoc DL, + SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; @@ -229,11 +322,11 @@ public: virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than - /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to - /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd - /// is expanded to mul + add. - virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. 
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; ConstraintType getConstraintType(const std::string &Constraint) const; @@ -245,12 +338,30 @@ public: SelectionDAG &DAG) const; std::pair<unsigned, const TargetRegisterClass*> - getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; + getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const LLVM_OVERRIDE; + +protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(MVT VT) const; + private: - const AArch64Subtarget *Subtarget; - const TargetRegisterInfo *RegInfo; const InstrItineraryData *Itins; + + const AArch64Subtarget *getSubtarget() const { + return &getTargetMachine().getSubtarget<AArch64Subtarget>(); + } }; +enum NeonModImmType { + Neon_Mov_Imm, + Neon_Mvn_Imm +}; + +extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement, + bool &usesOnlyOneValue, bool &hasDominantValue, + bool &isConstant, bool &isUNDEF); } // namespace llvm #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 9dd122f..34f917c 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -120,6 +120,14 @@ class A64InstRdnm<dag outs, dag ins, string asmstr, let Inst{20-16} = Rm; } +class A64InstRtnm<dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<5> Rm; + + let Inst{20-16} = Rm; +} + //===----------------------------------------------------------------------===// // // Actual A64 Instruction Formats @@ -383,6 +391,8 @@ class A64I_extract<bit sf, bits<3> op, bit n, // Inherits Rd in 4-0 } +let Predicates = [HasFPARMv8] in { + // Format for floating-point compare instructions. class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2, dag outs, dag ins, string asmstr, @@ -562,6 +572,8 @@ class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5, // Inherit Rd in 4-0 } +} + // Format for load-register (literal) instructions. class A64I_LDRlit<bits<2> opc, bit v, dag outs, dag ins, string asmstr, @@ -959,3 +971,519 @@ class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4, let Inst{4-0} = op4; } + +//===----------------------------------------------------------------------===// +// +// Neon Instruction Format Definitions. 
+// + +let Predicates = [HasNEON] in { + +class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit> { +} + +// Format AdvSIMD bitwise extract +class NeonI_BitExtract<bit q, bits<2> op2, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = op2; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + // imm4 in 14-11 + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD perm +class NeonI_Perm<bit q, bits<2> size, bits<3> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + let Inst{14-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD table lookup +class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-24} = 0b001110; + let Inst{23-22} = op2; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 3 vector registers with same vector type +class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 3 vector registers with different vector type +class NeonI_3VDiff<bit q, bit u, bits<2> size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11} = 0b0; + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD two registers and an element +class NeonI_2VElem<bit q, bit u, bits<2> size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01111; + let Inst{23-22} = size; + // l in Inst{21} + // m in Inst{20} + // Inherit Rm in 19-16 + let Inst{15-12} = opcode; + // h in Inst{11} + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 1 vector register with modified immediate +class NeonI_1VModImm<bit q, bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRd<outs,ins, asmstr, patterns, itin> { + bits<8> Imm; + bits<4> cmode; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = op; + let 
Inst{28-19} = 0b0111100000; + let Inst{15-12} = cmode; + let Inst{11} = 0b0; // o2 + let Inst{10} = 1; + // Inherit Rd in 4-0 + let Inst{18-16} = Imm{7-5}; // imm a:b:c + let Inst{9-5} = Imm{4-0}; // imm d:e:f:g:h +} + +// Format AdvSIMD 3 scalar registers with same type + +class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + + +// Format AdvSIMD 2 vector registers miscellaneous +class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 2 vector 1 immediate shift +class NeonI_2VShiftImm<bit q, bit u, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<7> Imm; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-23} = 0b011110; + let Inst{22-16} = Imm; + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD duplicate and insert +class NeonI_copy<bit q, bit op, bits<4> imm4, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<5> Imm5; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{20-16} = Imm5; + let Inst{15} = 0b0; + let Inst{14-11} = imm4; + let Inst{10} = 0b1; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} +// Format AdvSIMD insert from element to vector +class NeonI_insert<bit q, bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<5> Imm5; + bits<4> Imm4; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{20-16} = Imm5; + let Inst{15} = 0b0; + let Inst{14-11} = Imm4; + let Inst{10} = 0b1; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar pairwise +class NeonI_ScalarPair<bit u, bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 2 vector across lanes +class NeonI_2VAcross<bit q, bit u, bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} 
= opcode; + let Inst{11-10} = 0b10; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar two registers miscellaneous +class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag ins, + string asmstr, list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD vector load/store multiple N-element structure +class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011000; + let Inst{22} = l; + let Inst{21-16} = 0b000000; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD vector load/store multiple N-element structure (post-index) +class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtnm<outs, ins, asmstr, patterns, itin> +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011001; + let Inst{22} = l; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11-10} = size; + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD vector load Single N-element structure to all lanes +class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs, + dag ins, string asmstr, list<dag> patterns, + InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011010; + let Inst{22} = 0b1; + let Inst{21} = r; + let Inst{20-16} = 0b00000; + let Inst{15-13} = opcode; + let Inst{12} = 0b0; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD vector load/store Single N-element structure to/from one lane +class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs, + dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> +{ + bits<4> lane; + let Inst{31} = 0b0; + let Inst{29-23} = 0b0011010; + let Inst{22} = l; + let Inst{21} = r; + let Inst{20-16} = 0b00000; + let Inst{15-14} = op2_1; + let Inst{13} = op0; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD post-index vector load Single N-element structure to all lanes +class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs, + dag ins, string asmstr, list<dag> patterns, + InstrItinClass itin> + : A64InstRtnm<outs, ins, asmstr, patterns, itin> +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011011; + let Inst{22} = 0b1; + let Inst{21} = r; + // Inherit Rm in 20-16 + let Inst{15-13} = opcode; + let Inst{12} = 0b0; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD post-index vector load/store Single N-element structure +// to/from one lane +class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs, + dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtnm<outs, ins, asmstr, patterns, itin> +{ + bits<4> lane; + let 
Inst{31} = 0b0; + let Inst{29-23} = 0b0011011; + let Inst{22} = l; + let Inst{21} = r; + // Inherit Rm in 20-16 + let Inst{15-14} = op2_1; + let Inst{13} = op0; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD 3 scalar registers with different type + +class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31-30} = 0b01; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar shift by immediate + +class NeonI_ScalarShiftImm<bit u, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<4> Imm4; + bits<3> Imm3; + let Inst{31-30} = 0b01; + let Inst{29} = u; + let Inst{28-23} = 0b111110; + let Inst{22-19} = Imm4; + let Inst{18-16} = Imm3; + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD crypto AES +class NeonI_Crypto_AES<bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31-24} = 0b01001110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10100; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD crypto SHA +class NeonI_Crypto_SHA<bits<2> size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31-24} = 0b01011110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10100; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD crypto 3V SHA +class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31-24} = 0b01011110; + let Inst{23-22} = size; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + let Inst{14-12} = opcode; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar x indexed element +class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo, + bits<4> opcode, dag outs, dag ins, + string asmstr, list<dag> patterns, + InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> +{ + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11111; + let Inst{23} = szhi; + let Inst{22} = szlo; + // l in Inst{21} + // m in Instr{20} + // Inherit Rm in 19-16 + let Inst{15-12} = opcode; + // h in Inst{11} + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} +// Format AdvSIMD scalar copy - insert from element to scalar +class NeonI_ScalarCopy<dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> { + let Inst{28} = 0b1; +} +} + diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index cf3a2c3..180110a 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -29,14 +29,14 @@ 
#include <algorithm> -#define GET_INSTRINFO_CTOR +#define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" using namespace llvm; AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), - RI(*this, STI), Subtarget(STI) {} + Subtarget(STI) {} void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, @@ -68,43 +68,71 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg) .addImm(A64SysReg::NZCV); } else if (AArch64::GPR64RegClass.contains(DestReg)) { - assert(AArch64::GPR64RegClass.contains(SrcReg)); - Opc = AArch64::ORRxxx_lsl; - ZeroReg = AArch64::XZR; + if(AArch64::GPR64RegClass.contains(SrcReg)){ + Opc = AArch64::ORRxxx_lsl; + ZeroReg = AArch64::XZR; + } else{ + assert(AArch64::FPR64RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::GPR32RegClass.contains(DestReg)) { - assert(AArch64::GPR32RegClass.contains(SrcReg)); - Opc = AArch64::ORRwww_lsl; - ZeroReg = AArch64::WZR; + if(AArch64::GPR32RegClass.contains(SrcReg)){ + Opc = AArch64::ORRwww_lsl; + ZeroReg = AArch64::WZR; + } else{ + assert(AArch64::FPR32RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::FPR32RegClass.contains(DestReg)) { - assert(AArch64::FPR32RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) - .addReg(SrcReg); - return; + if(AArch64::FPR32RegClass.contains(SrcReg)){ + BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) + .addReg(SrcReg); + return; + } + else { + assert(AArch64::GPR32RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::FPR64RegClass.contains(DestReg)) { - assert(AArch64::FPR64RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) - .addReg(SrcReg); - return; + if(AArch64::FPR64RegClass.contains(SrcReg)){ + BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) + .addReg(SrcReg); + return; + } + else { + assert(AArch64::GPR64RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::FPR128RegClass.contains(DestReg)) { assert(AArch64::FPR128RegClass.contains(SrcReg)); - // FIXME: there's no good way to do this, at least without NEON: - // + There's no single move instruction for q-registers - // + We can't create a spill slot and use normal STR/LDR because stack - // allocation has already happened - // + We can't go via X-registers with FMOV because register allocation has - // already happened. - // This may not be efficient, but at least it works. - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) - .addReg(SrcReg) - .addReg(AArch64::XSP) - .addImm(0x1ff & -16); - - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) - .addReg(AArch64::XSP, RegState::Define) - .addReg(AArch64::XSP) - .addImm(16); - return; + // If NEON is enable, we use ORR to implement this copy. + // If NEON isn't available, emit STR and LDR to handle this. 
+ if(getSubTarget().hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg) + .addReg(SrcReg) + .addReg(SrcReg); + return; + } else { + BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) + .addReg(SrcReg) + .addReg(AArch64::XSP) + .addImm(0x1ff & -16); + + BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) + .addReg(AArch64::XSP, RegState::Define) + .addReg(AArch64::XSP) + .addImm(16); + return; + } } else { llvm_unreachable("Unknown register class in copyPhysReg"); } @@ -116,17 +144,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0); } -MachineInstr * -AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) - .addFrameIndex(FrameIx).addImm(0) - .addImm(Offset) - .addMetadata(MDPtr); - return &*MIB; -} - /// Does the Opcode represent a conditional branch that we can remove and re-add /// at the end of a basic block? static bool isCondBranch(unsigned Opc) { diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 22a2ab4..620ecc9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -43,10 +43,6 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const; - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; - void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index d2cfc7d..23d81fc 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -11,6 +11,19 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. +// +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON", "neon">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto","crypto">; + +// Use fused MAC if more precision in FP computation is allowed. +def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast)">; include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// @@ -114,6 +127,8 @@ def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>; def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>; +class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; + //===----------------------------------------------------------------------===// // Call sequence pseudo-instructions //===----------------------------------------------------------------------===// @@ -1263,7 +1278,7 @@ def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)), // UBFX makes sense as an implementation of a 64-bit zero-extension too. Could // use either 64-bit or 32-bit variant, but 32-bit might be more efficient. 
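The copyPhysReg changes above emit single-instruction moves for cross-bank and 128-bit copies: FMOV between GPR and FPR registers, and (when NEON is available) ORR Vd.16b, Vn.16b, Vn.16b for FPR128 copies, instead of bouncing through the stack. A sketch of code that moves a value between the FP and integer banks, where such a move shows up either through the FMOV bitconvert patterns below or through a register-allocator inserted copy (illustrative):

    #include <stdint.h>
    #include <string.h>

    // The bit pattern of 'd' crosses from the FP bank to the integer bank; the
    // move is a single FMOV Xd, Dn rather than a store and reload.
    uint64_t double_bits(double d) {
      uint64_t u;
      memcpy(&u, &d, sizeof u);
      return u;
    }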
-def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31), +def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31), sub_32)>; //===------------------------------- @@ -1967,6 +1982,13 @@ def fpz64 : Operand<f64>, let DecoderMethod = "DecodeFPZeroOperand"; } +def fpz64movi : Operand<i64>, + ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> { + let ParserMatchClass = fpzero_asmoperand; + let PrintMethod = "printFPZeroOperand"; + let DecoderMethod = "DecodeFPZeroOperand"; +} + multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> { def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0}, (outs), ins, "fcmp\t$Rn, $Rm", [pattern], @@ -2173,6 +2195,29 @@ def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>; def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>; def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>; +// Extra patterns for when we're allowed to optimise separate multiplication and +// addition. +let Predicates = [HasFPARMv8, UseFusedMAC] in { +def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))), + (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))), + (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(f32 (fsub (f32 (fmul FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)), + (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul FPR32:$Rn, FPR32:$Rm)))), + (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))), + (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))), + (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(f64 (fsub (f64 (fmul FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)), + (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul FPR64:$Rn, FPR64:$Rm)))), + (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +} + + //===----------------------------------------------------------------------===// // Floating-point <-> fixed-point conversion instructions //===----------------------------------------------------------------------===// @@ -2308,6 +2353,7 @@ defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">; defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">; defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">; +let Predicates = [HasFPARMv8] in { def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>; def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>; def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>; @@ -2316,6 +2362,7 @@ def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>; def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>; def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>; def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>; +} multiclass A64I_inttofp<bit o0, string asmop> { def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>; @@ -2327,6 +2374,7 @@ multiclass A64I_inttofp<bit o0, string asmop> { defm S : A64I_inttofp<0b0, "scvtf">; defm U : A64I_inttofp<0b1, "ucvtf">; +let Predicates = [HasFPARMv8] in { def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>; def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>; def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>; @@ -2335,16 +2383,19 @@ def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>; def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>; def : Pat<(f64 
(uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>; def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>; +} def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">; def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">; def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">; def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">; +let Predicates = [HasFPARMv8] in { def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>; def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>; def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>; def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>; +} def lane1_asmoperand : AsmOperandClass { let Name = "Lane1"; @@ -2367,11 +2418,13 @@ let DecoderMethod = "DecodeFMOVLaneInstruction" in { "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>; } +let Predicates = [HasFPARMv8] in { def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]", (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>; def : InstAlias<"fmov $Rd.2d[$Lane], $Rn", (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>; +} //===----------------------------------------------------------------------===// // Floating-point immediate instructions @@ -2465,11 +2518,15 @@ let mayLoad = 1 in { def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>; } +let Predicates = [HasFPARMv8] in { def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>; def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>; +} let mayLoad = 1 in { + let Predicates = [HasFPARMv8] in { def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>; + } def LDRSWx_lit : A64I_LDRlit<0b10, 0b0, @@ -3063,6 +3120,7 @@ defm LS32 defm LS64 : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>; +let Predicates = [HasFPARMv8] in { // STR/LDR to/from a B register defm LSFP8 : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>; @@ -3081,6 +3139,7 @@ defm LSFP64 defm LSFP128 : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128, qword_addrparams>; +} //===------------------------------ // 2.3 Signed loads @@ -3536,10 +3595,13 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg, defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">; defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">; + +let Predicates = [HasFPARMv8] in { defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">; defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">; defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7, "LSFPPair128">; +} def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1, @@ -3974,14 +4036,17 @@ def : movalias<MOVZxii, GPR64, movz64_movimm>; def : movalias<MOVNwii, GPR32, movn32_movimm>; def : movalias<MOVNxii, GPR64, movn64_movimm>; -def movw_addressref : ComplexPattern<i64, 2, "SelectMOVWAddressRef">; +def movw_addressref_g0 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<0>">; +def movw_addressref_g1 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<1>">; +def movw_addressref_g2 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<2>">; +def movw_addressref_g3 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<3>">; -def : Pat<(A64WrapperLarge movw_addressref:$G3, movw_addressref:$G2, - movw_addressref:$G1, movw_addressref:$G0), - (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref:$G3), - movw_addressref:$G2), - movw_addressref:$G1), - movw_addressref:$G0)>; +def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2, + movw_addressref_g1:$G1, movw_addressref_g0:$G0), + (MOVKxii (MOVKxii (MOVKxii (MOVZxii 
movw_addressref_g3:$G3), + movw_addressref_g2:$G2), + movw_addressref_g1:$G1), + movw_addressref_g0:$G0)>; //===----------------------------------------------------------------------===// // PC-relative addressing instructions @@ -5120,3 +5185,9 @@ defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm), defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)), (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "AArch64InstrNEON.td" diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td new file mode 100644 index 0000000..d71749d --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -0,0 +1,8671 @@ +//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the AArch64 NEON instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. +//===----------------------------------------------------------------------===// +def Neon_bsl : SDNode<"AArch64ISD::NEON_BSL", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + +// (outs Result), (ins Imm, OpCmode) +def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>; + +def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>; + +def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>; + +// (outs Result), (ins Imm) +def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1, + [SDTCisVec<0>, SDTCisVT<1, i32>]>>; + +// (outs Result), (ins LHS, RHS, CondCode) +def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; + +// (outs Result), (ins LHS, 0/0.0 constant, CondCode) +def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisVec<1>]>>; + +// (outs Result), (ins LHS, RHS) +def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; + +def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; +def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; + +def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>; +def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>; +def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>; +def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>; +def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>; +def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>; + +def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>; +def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>; +def Neon_rev16 : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>; +def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1, + [SDTCisVec<0>]>>; +def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", 
SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>; +def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>; + +def SDT_assertext : SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>; +def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>; +def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size, bits<5> opcode, + string asmop, SDPatternOperator opnode8B, + SDPatternOperator opnode16B, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _8B : NeonI_3VSame<0b0, u, size, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], + NoItinerary>; + + def _16B : NeonI_3VSame<0b1, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], + NoItinerary>; + } + +} + +multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _4H : NeonI_3VSame<0b0, u, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h", + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))], + NoItinerary>; + + def _8H : NeonI_3VSame<0b1, u, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h", + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))], + NoItinerary>; + + def _2S : NeonI_3VSame<0b0, u, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))], + NoItinerary>; + + def _4S : NeonI_3VSame<0b1, u, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))], + NoItinerary>; + } +} +multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> + : NeonI_3VSame_HS_sizes<u, opcode, asmop, opnode, Commutable> { + let isCommutable = Commutable in { + def _8B : NeonI_3VSame<0b0, u, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], + NoItinerary>; + + def _16B : NeonI_3VSame<0b1, u, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], + NoItinerary>; + } +} + +multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> + : NeonI_3VSame_BHS_sizes<u, opcode, asmop, opnode, Commutable> { + let isCommutable = Commutable in { + def _2D : NeonI_3VSame<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", + [(set (v2i64 
VPR128:$Rd), + (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))], + NoItinerary>; + } +} + +// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types, +// but Result types can be integer or floating point types. +multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode, + string asmop, SDPatternOperator opnode2S, + SDPatternOperator opnode4S, + SDPatternOperator opnode2D, + ValueType ResTy2S, ValueType ResTy4S, + ValueType ResTy2D, bit Commutable = 0> { + let isCommutable = Commutable in { + def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", + [(set (ResTy2S VPR64:$Rd), + (ResTy2S (opnode2S (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))], + NoItinerary>; + + def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (ResTy4S VPR128:$Rd), + (ResTy4S (opnode4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))], + NoItinerary>; + + def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", + [(set (ResTy2D VPR128:$Rd), + (ResTy2D (opnode2D (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))], + NoItinerary>; + } +} + +//===----------------------------------------------------------------------===// +// Instruction Definitions +//===----------------------------------------------------------------------===// + +// Vector Arithmetic Instructions + +// Vector Add (Integer and Floating-Point) + +defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>; +defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, fadd, fadd, + v2f32, v4f32, v2f64, 1>; + +// Vector Sub (Integer and Floating-Point) + +defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>; +defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, fsub, fsub, + v2f32, v4f32, v2f64, 0>; + +// Vector Multiply (Integer and Floating-Point) + +defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>; +defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, fmul, fmul, + v2f32, v4f32, v2f64, 1>; + +// Vector Multiply (Polynomial) + +defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul", + int_arm_neon_vmulp, int_arm_neon_vmulp, 1>; + +// Vector Multiply-accumulate and Multiply-subtract (Integer) + +// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and +// two operands constraints. 
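A minimal sketch of the read-modify-write behaviour that the "$src = $Rd" constraint in the class below expresses, using MLA .4s as the example (illustrative only; the lane loop is my own model, not backend code):

#include <array>
#include <cstdint>

using V4i32 = std::array<uint32_t, 4>;

// MLA per lane: the destination register is also an input accumulator.
V4i32 mla_4s(V4i32 Rd, const V4i32 &Rn, const V4i32 &Rm) {
  for (int i = 0; i < 4; ++i)
    Rd[i] += Rn[i] * Rm[i];   // accumulate into the existing Rd contents
  return Rd;
}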
+class NeonI_3VSame_Constraint_impl<string asmop, string asmlane, + RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size, + bits<5> opcode, SDPatternOperator opnode> + : NeonI_3VSame<q, u, size, opcode, + (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm), + asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane, + [(set (OpTy VPRC:$Rd), + (OpTy (opnode (OpTy VPRC:$src), (OpTy VPRC:$Rn), (OpTy VPRC:$Rm))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (mul node:$Rn, node:$Rm))>; + +def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (sub node:$Ra, (mul node:$Rn, node:$Rm))>; + + +def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8, + 0b0, 0b0, 0b00, 0b10010, Neon_mla>; +def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8, + 0b1, 0b0, 0b00, 0b10010, Neon_mla>; +def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16, + 0b0, 0b0, 0b01, 0b10010, Neon_mla>; +def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16, + 0b1, 0b0, 0b01, 0b10010, Neon_mla>; +def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32, + 0b0, 0b0, 0b10, 0b10010, Neon_mla>; +def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32, + 0b1, 0b0, 0b10, 0b10010, Neon_mla>; + +def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b00, 0b10010, Neon_mls>; +def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b00, 0b10010, Neon_mls>; +def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16, + 0b0, 0b1, 0b01, 0b10010, Neon_mls>; +def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16, + 0b1, 0b1, 0b01, 0b10010, Neon_mls>; +def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32, + 0b0, 0b1, 0b10, 0b10010, Neon_mls>; +def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32, + 0b1, 0b1, 0b10, 0b10010, Neon_mls>; + +// Vector Multiply-accumulate and Multiply-subtract (Floating Point) + +def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (fadd node:$Ra, (fmul node:$Rn, node:$Rm))>; + +def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (fsub node:$Ra, (fmul node:$Rn, node:$Rm))>; + +let Predicates = [HasNEON, UseFusedMAC] in { +def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32, + 0b0, 0b0, 0b00, 0b11001, Neon_fmla>; +def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32, + 0b1, 0b0, 0b00, 0b11001, Neon_fmla>; +def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64, + 0b1, 0b0, 0b01, 0b11001, Neon_fmla>; + +def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32, + 0b0, 0b0, 0b10, 0b11001, Neon_fmls>; +def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32, + 0b1, 0b0, 0b10, 0b11001, Neon_fmls>; +def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64, + 0b1, 0b0, 0b11, 0b11001, Neon_fmls>; +} + +// We're also allowed to match the fma instruction regardless of compile +// options. 
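A short sketch of the distinction being drawn (scalar C++ for brevity; the same reasoning applies to the vector FMLA/FMLS patterns that follow): a separate multiply and add is typically only contracted into a fused multiply-add when FP-op fusion is allowed (e.g. -ffp-contract=fast, matching the UseFusedMAC predicate), because fusing skips the intermediate rounding, while an explicit fma call maps to an FMA node and may always be matched.

#include <cmath>

float maybe_fused(float a, float b, float c) {
  return a * b + c;          // fadd(fmul(...)) -- fused only when contraction is allowed
}

float always_fused(float a, float b, float c) {
  return std::fma(a, b, c);  // single-rounding FMA regardless of compile options
}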
+def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)), + (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; +def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), + (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; +def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), + (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; + +def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)), + (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; +def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), + (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; +def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), + (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; + +// Vector Divide (Floating-Point) + +defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, fdiv, fdiv, + v2f32, v4f32, v2f64, 0>; + +// Vector Bitwise Operations + +// Vector Bitwise AND + +defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>; + +// Vector Bitwise Exclusive OR + +defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>; + +// Vector Bitwise OR + +defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>; + +// ORR disassembled as MOV if Vn==Vm + +// Vector Move - register +// Alias for ORR if Vn=Vm. +// FIXME: This is actually the preferred syntax but TableGen can't deal with +// custom printing of aliases. +def : NeonInstAlias<"mov $Rd.8b, $Rn.8b", + (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>; +def : NeonInstAlias<"mov $Rd.16b, $Rn.16b", + (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>; + +// The MOVI instruction takes two immediate operands. The first is the +// immediate encoding, while the second is the cmode. A cmode of 14, or +// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC. 
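As the comment above notes, a cmode of 0b1110 selects the byte-replicating MOVI form; a rough sketch of that expansion (assumed behaviour, names are mine) shows why (Neon_movi 0, 14) and (Neon_movi 255, 14) below stand for the all-zeros and all-ones vectors:

#include <cstdint>

// Replicate an 8-bit immediate into every byte of one 64-bit half of the
// vector, as the cmode 0b1110 (MOVI per-byte) encoding does.
uint64_t expand_movi_cmode14(uint8_t Imm8) {
  uint64_t Result = 0;
  for (int Byte = 0; Byte < 8; ++Byte)
    Result |= uint64_t(Imm8) << (8 * Byte);
  return Result;
}
// expand_movi_cmode14(0x00) == 0 (Neon_AllZero);
// expand_movi_cmode14(0xff) == ~0ull (Neon_AllOne).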
+def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>; +def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>; + +def Neon_not8B : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>; +def Neon_not16B : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>; + +def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm), + (or node:$Rn, (Neon_not8B node:$Rm))>; + +def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm), + (or node:$Rn, (Neon_not16B node:$Rm))>; + +def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm), + (and node:$Rn, (Neon_not8B node:$Rm))>; + +def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm), + (and node:$Rn, (Neon_not16B node:$Rm))>; + + +// Vector Bitwise OR NOT - register + +defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn", + Neon_orn8B, Neon_orn16B, 0>; + +// Vector Bitwise Bit Clear (AND NOT) - register + +defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic", + Neon_bic8B, Neon_bic16B, 0>; + +multiclass Neon_bitwise2V_patterns<SDPatternOperator opnode8B, + SDPatternOperator opnode16B, + Instruction INST8B, + Instruction INST16B> { + def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; +} + +// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN +defm : Neon_bitwise2V_patterns<and, and, ANDvvv_8B, ANDvvv_16B>; +defm : Neon_bitwise2V_patterns<or, or, ORRvvv_8B, ORRvvv_16B>; +defm : Neon_bitwise2V_patterns<xor, xor, EORvvv_8B, EORvvv_16B>; +defm : Neon_bitwise2V_patterns<Neon_bic8B, Neon_bic16B, BICvvv_8B, BICvvv_16B>; +defm : Neon_bitwise2V_patterns<Neon_orn8B, Neon_orn16B, ORNvvv_8B, ORNvvv_16B>; + +// Vector Bitwise Select +def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b01, 0b00011, Neon_bsl>; + +def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b01, 0b00011, Neon_bsl>; + +multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode, + Instruction INST8B, + Instruction INST16B> { + // Disassociate type from instruction definition + def : Pat<(v2i32 (opnode VPR64:$src,VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i32 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + + // Allow to match BSL instruction pattern with non-constant operand + def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, 
VPR64:$Rm)>; + def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + + // Allow to match llvm.arm.* intrinsics. + def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src), + (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src), + (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src), + (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src), + (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src), + (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src), + (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src), + (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src), + (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src), + (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src), + (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src), + (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src), + (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; +} + +// Additional patterns for bitwise instruction BSL +defm: Neon_bitwise3V_patterns<Neon_bsl, BSLvvv_8B, BSLvvv_16B>; + +def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm), + (Neon_bsl node:$src, node:$Rn, node:$Rm), + [{ (void)N; return false; }]>; + +// Vector Bitwise Insert if True + +def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>; +def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>; + +// Vector Bitwise Insert if False + +def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>; +def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>; + +// Vector 
Absolute Difference and Accumulate (Signed, Unsigned) + +def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>; +def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>; + +// Vector Absolute Difference and Accumulate (Unsigned) +def UABAvvv_8B : NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b00, 0b01111, Neon_uaba>; +def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b00, 0b01111, Neon_uaba>; +def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16, + 0b0, 0b1, 0b01, 0b01111, Neon_uaba>; +def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16, + 0b1, 0b1, 0b01, 0b01111, Neon_uaba>; +def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32, + 0b0, 0b1, 0b10, 0b01111, Neon_uaba>; +def UABAvvv_4S : NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32, + 0b1, 0b1, 0b10, 0b01111, Neon_uaba>; + +// Vector Absolute Difference and Accumulate (Signed) +def SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8, + 0b0, 0b0, 0b00, 0b01111, Neon_saba>; +def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8, + 0b1, 0b0, 0b00, 0b01111, Neon_saba>; +def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16, + 0b0, 0b0, 0b01, 0b01111, Neon_saba>; +def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16, + 0b1, 0b0, 0b01, 0b01111, Neon_saba>; +def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32, + 0b0, 0b0, 0b10, 0b01111, Neon_saba>; +def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32, + 0b1, 0b0, 0b10, 0b01111, Neon_saba>; + + +// Vector Absolute Difference (Signed, Unsigned) +defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>; +defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>; + +// Vector Absolute Difference (Floating Point) +defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd", + int_arm_neon_vabds, int_arm_neon_vabds, + int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>; + +// Vector Reciprocal Step (Floating Point) +defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps", + int_arm_neon_vrecps, int_arm_neon_vrecps, + int_arm_neon_vrecps, + v2f32, v4f32, v2f64, 0>; + +// Vector Reciprocal Square Root Step (Floating Point) +defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", + int_arm_neon_vrsqrts, + int_arm_neon_vrsqrts, + int_arm_neon_vrsqrts, + v2f32, v4f32, v2f64, 0>; + +// Vector Comparisons + +def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETEQ)>; +def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETUGE)>; +def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETGE)>; +def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETUGT)>; +def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETGT)>; + +// NeonI_compare_aliases class: swaps register operands to implement +// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed. 
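The alias class below works because the comparisons are each other's converses: a <= b holds exactly when b >= a (and a < b when b > a), so CMLE/CMLT/CMLS/CMLO need no encodings of their own. A tiny per-lane model of that swap, for illustration only:

#include <cstdint>

uint32_t cmge_lane(int32_t a, int32_t b) { return a >= b ? 0xffffffffu : 0u; }
// "cmle vd, vn, vm" is emitted as "cmge vd, vm, vn": same result, operands swapped.
uint32_t cmle_lane(int32_t a, int32_t b) { return cmge_lane(b, a); }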
+class NeonI_compare_aliases<string asmop, string asmlane, + Instruction inst, RegisterOperand VPRC> + : NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane # + ", $Rm" # asmlane, + (inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>; + +// Vector Comparisons (Integer) + +// Vector Compare Mask Equal (Integer) +let isCommutable =1 in { +defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>; +} + +// Vector Compare Mask Higher or Same (Unsigned Integer) +defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>; + +// Vector Compare Mask Greater Than or Equal (Integer) +defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>; + +// Vector Compare Mask Higher (Unsigned Integer) +defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>; + +// Vector Compare Mask Greater Than (Integer) +defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>; + +// Vector Compare Mask Bitwise Test (Integer) +defm CMTSTvvv: NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>; + +// Vector Compare Mask Less or Same (Unsigned Integer) +// CMLS is alias for CMHS with operands reversed. +def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>; +def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>; +def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>; +def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>; +def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>; +def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>; +def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>; + +// Vector Compare Mask Less Than or Equal (Integer) +// CMLE is alias for CMGE with operands reversed. +def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>; +def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>; +def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>; +def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>; +def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>; +def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>; +def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>; + +// Vector Compare Mask Lower (Unsigned Integer) +// CMLO is alias for CMHI with operands reversed. +def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>; +def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>; +def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>; +def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>; +def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>; +def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>; +def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>; + +// Vector Compare Mask Less Than (Integer) +// CMLT is alias for CMGT with operands reversed. 
+def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>; +def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>; +def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>; +def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>; +def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>; +def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>; +def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>; + + +def neon_uimm0_asmoperand : AsmOperandClass +{ + let Name = "UImm0"; + let PredicateMethod = "isUImm<0>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm0 : Operand<i32>, ImmLeaf<i32, [{return Imm == 0;}]> { + let ParserMatchClass = neon_uimm0_asmoperand; + let PrintMethod = "printNeonUImm0Operand"; + +} + +multiclass NeonI_cmpz_sizes<bit u, bits<5> opcode, string asmop, CondCode CC> +{ + def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.8b, $Rn.8b, $Imm", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.16b, $Rn.16b, $Imm", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.4h, $Rn.4h, $Imm", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.8h, $Rn.8h, $Imm", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.2s, $Rn.2s, $Imm", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.4s, $Rn.4s, $Imm", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.2d, $Rn.2d, $Imm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; +} + +// Vector Compare Mask Equal to Zero (Integer) +defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>; + +// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) +defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>; + +// Vector Compare Mask Greater Than Zero (Signed Integer) +defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>; + +// Vector Compare Mask Less Than or Equal To Zero (Signed Integer) +defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>; + +// Vector Compare Mask Less Than Zero (Signed Integer) +defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>; + +// Vector Comparisons (Floating Point) + +// Vector Compare Mask Equal (Floating Point) +let isCommutable =1 in { +defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq, + Neon_cmeq, Neon_cmeq, + v2i32, v4i32, v2i64, 0>; +} + +// 
Vector Compare Mask Greater Than Or Equal (Floating Point) +defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge, + Neon_cmge, Neon_cmge, + v2i32, v4i32, v2i64, 0>; + +// Vector Compare Mask Greater Than (Floating Point) +defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt, + Neon_cmgt, Neon_cmgt, + v2i32, v4i32, v2i64, 0>; + +// Vector Compare Mask Less Than Or Equal (Floating Point) +// FCMLE is alias for FCMGE with operands reversed. +def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>; +def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>; +def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>; + +// Vector Compare Mask Less Than (Floating Point) +// FCMLT is alias for FCMGT with operands reversed. +def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>; +def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>; +def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>; + + +multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode, + string asmop, CondCode CC> +{ + def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.2s, $Rn.2s, $FPImm", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; + + def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.4s, $Rn.4s, $FPImm", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; + + def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.2d, $Rn.2d, $FPImm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; +} + +// Vector Compare Mask Equal to Zero (Floating Point) +defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>; + +// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) +defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>; + +// Vector Compare Mask Greater Than Zero (Floating Point) +defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>; + +// Vector Compare Mask Less Than or Equal To Zero (Floating Point) +defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>; + +// Vector Compare Mask Less Than Zero (Floating Point) +defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>; + +// Vector Absolute Comparisons (Floating Point) + +// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point) +defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge", + int_arm_neon_vacged, int_arm_neon_vacgeq, + int_aarch64_neon_vacgeq, + v2i32, v4i32, v2i64, 0>; + +// Vector Absolute Compare Mask Greater Than (Floating Point) +defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt", + int_arm_neon_vacgtd, int_arm_neon_vacgtq, + int_aarch64_neon_vacgtq, + v2i32, v4i32, v2i64, 0>; + +// Vector Absolute Compare Mask Less Than Or Equal (Floating Point) +// FACLE is alias for FACGE with operands reversed. 
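A per-lane model (illustrative only, not backend code) of the "absolute" compares defined just below: the operands are compared by magnitude and a true result sets every bit of the lane, with FACLE/FACLT again formed by the operand swap used for the other aliases:

#include <cmath>
#include <cstdint>

uint32_t facge_lane(float a, float b) {
  return std::fabs(a) >= std::fabs(b) ? 0xffffffffu : 0u;
}
uint32_t facle_lane(float a, float b) { return facge_lane(b, a); }  // swapped operands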
+def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>; +def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>; +def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>; + +// Vector Absolute Compare Mask Less Than (Floating Point) +// FACLT is alias for FACGT with operands reversed. +def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>; +def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>; +def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>; + +// Vector halving add (Integer Signed, Unsigned) +defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd", + int_arm_neon_vhadds, 1>; +defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd", + int_arm_neon_vhaddu, 1>; + +// Vector halving sub (Integer Signed, Unsigned) +defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub", + int_arm_neon_vhsubs, 0>; +defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub", + int_arm_neon_vhsubu, 0>; + +// Vector rouding halving add (Integer Signed, Unsigned) +defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd", + int_arm_neon_vrhadds, 1>; +defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd", + int_arm_neon_vrhaddu, 1>; + +// Vector Saturating add (Integer Signed, Unsigned) +defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd", + int_arm_neon_vqadds, 1>; +defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd", + int_arm_neon_vqaddu, 1>; + +// Vector Saturating sub (Integer Signed, Unsigned) +defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub", + int_arm_neon_vqsubs, 1>; +defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub", + int_arm_neon_vqsubu, 1>; + +// Vector Shift Left (Signed and Unsigned Integer) +defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl", + int_arm_neon_vshifts, 1>; +defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl", + int_arm_neon_vshiftu, 1>; + +// Vector Saturating Shift Left (Signed and Unsigned Integer) +defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl", + int_arm_neon_vqshifts, 1>; +defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl", + int_arm_neon_vqshiftu, 1>; + +// Vector Rouding Shift Left (Signed and Unsigned Integer) +defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl", + int_arm_neon_vrshifts, 1>; +defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl", + int_arm_neon_vrshiftu, 1>; + +// Vector Saturating Rouding Shift Left (Signed and Unsigned Integer) +defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl", + int_arm_neon_vqrshifts, 1>; +defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl", + int_arm_neon_vqrshiftu, 1>; + +// Vector Maximum (Signed and Unsigned Integer) +defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>; +defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>; + +// Vector Minimum (Signed and Unsigned Integer) +defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>; +defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>; + +// Vector Maximum (Floating Point) +defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax", + int_arm_neon_vmaxs, int_arm_neon_vmaxs, + int_arm_neon_vmaxs, v2f32, v4f32, v2f64, 1>; + +// Vector Minimum (Floating Point) +defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin", + int_arm_neon_vmins, int_arm_neon_vmins, + 
int_arm_neon_vmins, v2f32, v4f32, v2f64, 1>; + +// Vector maxNum (Floating Point) - prefer a number over a quiet NaN) +defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm", + int_aarch64_neon_vmaxnm, + int_aarch64_neon_vmaxnm, + int_aarch64_neon_vmaxnm, + v2f32, v4f32, v2f64, 1>; + +// Vector minNum (Floating Point) - prefer a number over a quiet NaN) +defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm", + int_aarch64_neon_vminnm, + int_aarch64_neon_vminnm, + int_aarch64_neon_vminnm, + v2f32, v4f32, v2f64, 1>; + +// Vector Maximum Pairwise (Signed and Unsigned Integer) +defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>; +defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>; + +// Vector Minimum Pairwise (Signed and Unsigned Integer) +defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>; +defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>; + +// Vector Maximum Pairwise (Floating Point) +defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp", + int_arm_neon_vpmaxs, int_arm_neon_vpmaxs, + int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>; + +// Vector Minimum Pairwise (Floating Point) +defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp", + int_arm_neon_vpmins, int_arm_neon_vpmins, + int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>; + +// Vector maxNum Pairwise (Floating Point) - prefer a number over a quiet NaN) +defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp", + int_aarch64_neon_vpmaxnm, + int_aarch64_neon_vpmaxnm, + int_aarch64_neon_vpmaxnm, + v2f32, v4f32, v2f64, 1>; + +// Vector minNum Pairwise (Floating Point) - prefer a number over a quiet NaN) +defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp", + int_aarch64_neon_vpminnm, + int_aarch64_neon_vpminnm, + int_aarch64_neon_vpminnm, + v2f32, v4f32, v2f64, 1>; + +// Vector Addition Pairwise (Integer) +defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>; + +// Vector Addition Pairwise (Floating Point) +defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp", + int_arm_neon_vpadd, + int_arm_neon_vpadd, + int_arm_neon_vpadd, + v2f32, v4f32, v2f64, 1>; + +// Vector Saturating Doubling Multiply High +defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh", + int_arm_neon_vqdmulh, 1>; + +// Vector Saturating Rouding Doubling Multiply High +defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh", + int_arm_neon_vqrdmulh, 1>; + +// Vector Multiply Extended (Floating Point) +defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx", + int_aarch64_neon_vmulx, + int_aarch64_neon_vmulx, + int_aarch64_neon_vmulx, + v2f32, v4f32, v2f64, 1>; + +// Vector Immediate Instructions + +multiclass neon_mov_imm_shift_asmoperands<string PREFIX> +{ + def _asmoperand : AsmOperandClass + { + let Name = "NeonMovImmShift" # PREFIX; + let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands"; + let PredicateMethod = "isNeonMovImmShift" # PREFIX; + } +} + +// Definition of vector immediates shift operands + +// The selectable use-cases extract the shift operation +// information from the OpCmode fields encoded in the immediate. 
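The XForm and ImmLeaf predicates that follow recover the shift kind and amount from the OpCmode value via A64Imms::decodeNeonModShiftImm. As a hedged illustration only (this approximates the standard AdvSIMD modified-immediate cmode table, not the backend helper's exact interface), the mapping looks roughly like this, with "OnesIn" distinguishing MSL (ones shifted in) from LSL:

#include <cstdint>

struct ModImmShift { bool Valid; unsigned Amount; bool OnesIn; };

ModImmShift decodeModImmShift(unsigned Cmode) {
  switch (Cmode & 0xf) {
  case 0x0: case 0x1: return {true,  0, false};  // 32-bit, LSL #0
  case 0x2: case 0x3: return {true,  8, false};  // 32-bit, LSL #8
  case 0x4: case 0x5: return {true, 16, false};  // 32-bit, LSL #16
  case 0x6: case 0x7: return {true, 24, false};  // 32-bit, LSL #24
  case 0x8: case 0x9: return {true,  0, false};  // 16-bit, LSL #0
  case 0xa: case 0xb: return {true,  8, false};  // 16-bit, LSL #8
  case 0xc:           return {true,  8, true};   // 32-bit, MSL #8
  case 0xd:           return {true, 16, true};   // 32-bit, MSL #16
  default:            return {false, 0, false};  // 0b111x forms carry no shift
  }
}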
+def neon_mod_shift_imm_XFORM : SDNodeXForm<imm, [{ + uint64_t OpCmode = N->getZExtValue(); + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); + if (!HasShift) return SDValue(); + return CurDAG->getTargetConstant(ShiftImm, MVT::i32); +}]>; + +// Vector immediates shift operands which accept LSL and MSL +// shift operators with shift value in the range of 0, 8, 16, 24 (LSL), +// or 0, 8 (LSLH) or 8, 16 (MSL). +defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">; +defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">; +// LSLH restricts shift amount to 0, 8 out of 0, 8, 16, 24 +defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">; + +multiclass neon_mov_imm_shift_operands<string PREFIX, + string HALF, string ISHALF, code pred> +{ + def _operand : Operand<i32>, ImmLeaf<i32, pred, neon_mod_shift_imm_XFORM> + { + let PrintMethod = + "printNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">"; + let DecoderMethod = + "DecodeNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">"; + let ParserMatchClass = + !cast<AsmOperandClass>("neon_mov_imm_" # PREFIX # HALF # "_asmoperand"); + } +} + +defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); +}]>; + +defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && ShiftOnesIn); +}]>; + +defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); +}]>; + +def neon_uimm1_asmoperand : AsmOperandClass +{ + let Name = "UImm1"; + let PredicateMethod = "isUImm<1>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm2_asmoperand : AsmOperandClass +{ + let Name = "UImm2"; + let PredicateMethod = "isUImm<2>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm8_asmoperand : AsmOperandClass +{ + let Name = "UImm8"; + let PredicateMethod = "isUImm<8>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> { + let ParserMatchClass = neon_uimm8_asmoperand; + let PrintMethod = "printUImmHexOperand"; +} + +def neon_uimm64_mask_asmoperand : AsmOperandClass +{ + let Name = "NeonUImm64Mask"; + let PredicateMethod = "isNeonUImm64Mask"; + let RenderMethod = "addNeonUImm64MaskOperands"; +} + +// MCOperand for 64-bit bytemask with each byte having only the +// value 0x00 and 0xff is encoded as an unsigned 8-bit value +def neon_uimm64_mask : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> { + let ParserMatchClass = neon_uimm64_mask_asmoperand; + let PrintMethod = "printNeonUImm64MaskOperand"; +} + +multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op, + SDPatternOperator opnode> +{ + // shift zeros, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (timm:$Imm), + (neon_mov_imm_LSL_operand:$Simm))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; + } 
+ + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (timm:$Imm), + (neon_mov_imm_LSL_operand:$Simm))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; + } + + // shift zeros, per halfword + def _4H : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (timm:$Imm), + (neon_mov_imm_LSLH_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b0}; + } + + def _8H : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (timm:$Imm), + (neon_mov_imm_LSLH_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b0}; + } +} + +multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op, + SDPatternOperator opnode, + SDPatternOperator neonopnode> +{ + let Constraints = "$src = $Rd" in { + // shift zeros, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (v2i32 VPR64:$src), + (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$src), + (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; + } + + // shift zeros, per halfword + def _4H : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (v4i16 VPR64:$src), + (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b1}; + } + + def _8H : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (v8i16 VPR128:$src), + (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b1}; + } + } +} + +multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op, + SDPatternOperator opnode> +{ + // shift ones, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_MSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (timm:$Imm), + (neon_mov_imm_MSL_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b1, 0b0, Simm}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_MSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 
(opnode (timm:$Imm), + (neon_mov_imm_MSL_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b1, 0b0, Simm}; + } +} + +// Vector Move Immediate Shifted +let isReMaterializable = 1 in { +defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>; +} + +// Vector Move Inverted Immediate Shifted +let isReMaterializable = 1 in { +defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>; +} + +// Vector Bitwise Bit Clear (AND NOT) - immediate +let isReMaterializable = 1 in { +defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1, + and, Neon_mvni>; +} + +// Vector Bitwise OR - immedidate + +let isReMaterializable = 1 in { +defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0, + or, Neon_movi>; +} + +// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate +// LowerBUILD_VECTOR favors lowering MOVI over MVNI. +// BIC immediate instructions selection requires additional patterns to +// transform Neon_movi operands into BIC immediate operands + +def neon_mov_imm_LSLH_transform_XFORM : SDNodeXForm<imm, [{ + uint64_t OpCmode = N->getZExtValue(); + unsigned ShiftImm; + unsigned ShiftOnesIn; + (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); + // LSLH restricts shift amount to 0, 8 which are encoded as 0 and 1 + // Transform encoded shift amount 0 to 1 and 1 to 0. + return CurDAG->getTargetConstant(!ShiftImm, MVT::i32); +}]>; + +def neon_mov_imm_LSLH_transform_operand + : ImmLeaf<i32, [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); }], + neon_mov_imm_LSLH_transform_XFORM>; + +// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0x00, LSL 8) +// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0x00) +def : Pat<(v4i16 (and VPR64:$src, + (v4i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))), + (BICvi_lsl_4H VPR64:$src, 0, + neon_mov_imm_LSLH_transform_operand:$Simm)>; + +// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0x00, LSL 8) +// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0x00) +def : Pat<(v8i16 (and VPR128:$src, + (v8i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))), + (BICvi_lsl_8H VPR128:$src, 0, + neon_mov_imm_LSLH_transform_operand:$Simm)>; + + +multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode, + SDPatternOperator neonopnode, + Instruction INST4H, + Instruction INST8H> { + def : Pat<(v8i8 (opnode VPR64:$src, + (bitconvert(v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST4H VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v1i64 (opnode VPR64:$src, + (bitconvert(v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST4H VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + + def : Pat<(v16i8 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v4i32 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v2i64 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; +} + +// Additional patterns for Vector 
Vector Bitwise Bit Clear (AND NOT) - immediate +defm : Neon_bitwiseVi_patterns<or, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H>; + +// Additional patterns for Vector Bitwise OR - immedidate +defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H>; + + +// Vector Move Immediate Masked +let isReMaterializable = 1 in { +defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>; +} + +// Vector Move Inverted Immediate Masked +let isReMaterializable = 1 in { +defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>; +} + +class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane, + Instruction inst, RegisterOperand VPRC> + : NeonInstAlias<!strconcat(asmop, "\t$Rd," # asmlane # ", $Imm"), + (inst VPRC:$Rd, neon_uimm8:$Imm, 0), 0b0>; + +// Aliases for Vector Move Immediate Shifted +def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>; + +// Aliases for Vector Move Inverted Immediate Shifted +def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>; + +// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate +def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>; + +// Aliases for Vector Bitwise OR - immedidate +def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>; + +// Vector Move Immediate - per byte +let isReMaterializable = 1 in { +def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0, + (outs VPR64:$Rd), (ins neon_uimm8:$Imm), + "movi\t$Rd.8b, $Imm", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} + +def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0, + (outs VPR128:$Rd), (ins neon_uimm8:$Imm), + "movi\t$Rd.16b, $Imm", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Move Immediate - bytemask, per double word +let isReMaterializable = 1 in { +def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1, + (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm), + "movi\t $Rd.2d, $Imm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Move Immediate - bytemask, one doubleword + +let isReMaterializable = 1 in { +def MOVIdi : NeonI_1VModImm<0b0, 0b1, + (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm), + "movi\t $Rd, $Imm", + [(set (v1i64 FPR64:$Rd), + (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Floating Point Move Immediate + +class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy, + Operand immOpType, bit q, bit op> + : NeonI_1VModImm<q, op, + (outs VPRC:$Rd), (ins immOpType:$Imm), + "fmov\t$Rd" # asmlane # ", $Imm", 
+ [(set (OpTy VPRC:$Rd), + (OpTy (Neon_fmovi (timm:$Imm))))], + NoItinerary> { + let cmode = 0b1111; + } + +let isReMaterializable = 1 in { +def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>; +def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>; +def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>; +} + +// Vector Shift (Immediate) +// Immediate in [0, 63] +def imm0_63 : Operand<i32> { + let ParserMatchClass = uimm6_asmoperand; +} + +// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded +// as follows: +// +// Offset Encoding +// 8 immh:immb<6:3> = '0001xxx', <imm> is encoded in immh:immb<2:0> +// 16 immh:immb<6:4> = '001xxxx', <imm> is encoded in immh:immb<3:0> +// 32 immh:immb<6:5> = '01xxxxx', <imm> is encoded in immh:immb<4:0> +// 64 immh:immb<6> = '1xxxxxx', <imm> is encoded in immh:immb<5:0> +// +// The shift right immediate amount, in the range 1 to element bits, is computed +// as Offset - UInt(immh:immb). The shift left immediate amount, in the range 0 +// to element bits - 1, is computed as UInt(immh:immb) - Offset. + +class shr_imm_asmoperands<string OFFSET> : AsmOperandClass { + let Name = "ShrImm" # OFFSET; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "ShrImm" # OFFSET; +} + +class shr_imm<string OFFSET> : Operand<i32> { + let EncoderMethod = "getShiftRightImm" # OFFSET; + let DecoderMethod = "DecodeShiftRightImm" # OFFSET; + let ParserMatchClass = + !cast<AsmOperandClass>("shr_imm" # OFFSET # "_asmoperand"); +} + +def shr_imm8_asmoperand : shr_imm_asmoperands<"8">; +def shr_imm16_asmoperand : shr_imm_asmoperands<"16">; +def shr_imm32_asmoperand : shr_imm_asmoperands<"32">; +def shr_imm64_asmoperand : shr_imm_asmoperands<"64">; + +def shr_imm8 : shr_imm<"8">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 8;}]>; +def shr_imm16 : shr_imm<"16">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 16;}]>; +def shr_imm32 : shr_imm<"32">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 32;}]>; +def shr_imm64 : shr_imm<"64">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 64;}]>; + +class shl_imm_asmoperands<string OFFSET> : AsmOperandClass { + let Name = "ShlImm" # OFFSET; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "ShlImm" # OFFSET; +} + +class shl_imm<string OFFSET> : Operand<i32> { + let EncoderMethod = "getShiftLeftImm" # OFFSET; + let DecoderMethod = "DecodeShiftLeftImm" # OFFSET; + let ParserMatchClass = + !cast<AsmOperandClass>("shl_imm" # OFFSET # "_asmoperand"); +} + +def shl_imm8_asmoperand : shl_imm_asmoperands<"8">; +def shl_imm16_asmoperand : shl_imm_asmoperands<"16">; +def shl_imm32_asmoperand : shl_imm_asmoperands<"32">; +def shl_imm64_asmoperand : shl_imm_asmoperands<"64">; + +def shl_imm8 : shl_imm<"8">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 8;}]>; +def shl_imm16 : shl_imm<"16">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 16;}]>; +def shl_imm32 : shl_imm<"32">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 32;}]>; +def shl_imm64 : shl_imm<"64">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 64;}]>; + +class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm", + [(set (Ty VPRC:$Rd), + (Ty (OpNode (Ty VPRC:$Rn), + (Ty (Neon_vdup (i32 ImmTy:$Imm))))))], + NoItinerary>; + +multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> { + // 64-bit vector types. 
+ def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + // 128-bit vector types. + def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> { + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + } +} + +multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> { + def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Shift left +defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">; + +// Shift right +defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>; +defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>; + +def Neon_High16B : PatFrag<(ops node:$in), + (extract_subvector (v16i8 node:$in), (iPTR 8))>; +def Neon_High8H : PatFrag<(ops node:$in), + (extract_subvector (v8i16 node:$in), (iPTR 4))>; +def Neon_High4S : PatFrag<(ops node:$in), + (extract_subvector (v4i32 node:$in), (iPTR 2))>; +def Neon_High2D : PatFrag<(ops node:$in), + (extract_subvector (v2i64 node:$in), (iPTR 1))>; +def Neon_High4float : PatFrag<(ops node:$in), + (extract_subvector (v4f32 node:$in), (iPTR 2))>; +def Neon_High2double : PatFrag<(ops node:$in), + (extract_subvector (v2f64 node:$in), (iPTR 1))>; + +def Neon_Low16B : PatFrag<(ops node:$in), + (v8i8 (extract_subvector (v16i8 node:$in), + (iPTR 0)))>; +def Neon_Low8H : PatFrag<(ops node:$in), + (v4i16 (extract_subvector (v8i16 node:$in), + (iPTR 0)))>; +def Neon_Low4S : PatFrag<(ops node:$in), + (v2i32 (extract_subvector (v4i32 node:$in), + (iPTR 0)))>; +def Neon_Low2D : PatFrag<(ops node:$in), + (v1i64 (extract_subvector (v2i64 node:$in), + (iPTR 0)))>; +def Neon_Low4float : PatFrag<(ops node:$in), + (v2f32 (extract_subvector (v4f32 node:$in), + (iPTR 0)))>; +def Neon_Low2double : PatFrag<(ops node:$in), + (v1f64 (extract_subvector (v2f64 node:$in), + (iPTR 0)))>; + +class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT, + string SrcT, ValueType DestTy, ValueType SrcTy, + Operand ImmTy, SDPatternOperator ExtOp> + : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd), + (ins 
VPR64:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm", + [(set (DestTy VPR128:$Rd), + (DestTy (shl + (DestTy (ExtOp (SrcTy VPR64:$Rn))), + (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))], + NoItinerary>; + +class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT, + string SrcT, ValueType DestTy, ValueType SrcTy, + int StartIndex, Operand ImmTy, + SDPatternOperator ExtOp, PatFrag getTop> + : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd), + (ins VPR128:$Rn, ImmTy:$Imm), + asmop # "2\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm", + [(set (DestTy VPR128:$Rd), + (DestTy (shl + (DestTy (ExtOp + (SrcTy (getTop VPR128:$Rn)))), + (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))], + NoItinerary>; + +multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop, + SDNode ExtOp> { + // 64-bit vector types. + def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8, + shl_imm8, ExtOp> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16, + shl_imm16, ExtOp> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32, + shl_imm32, ExtOp> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + // 128-bit vector types + def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8, + 8, shl_imm8, ExtOp, Neon_High16B> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16, + 4, shl_imm16, ExtOp, Neon_High8H> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32, + 2, shl_imm32, ExtOp, Neon_High4S> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + // Use other patterns to match when the immediate is 0. + def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))), + (!cast<Instruction>(prefix # "_8B") VPR64:$Rn, 0)>; + + def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))), + (!cast<Instruction>(prefix # "_4H") VPR64:$Rn, 0)>; + + def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))), + (!cast<Instruction>(prefix # "_2S") VPR64:$Rn, 0)>; + + def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))), + (!cast<Instruction>(prefix # "_16B") VPR128:$Rn, 0)>; + + def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))), + (!cast<Instruction>(prefix # "_8H") VPR128:$Rn, 0)>; + + def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))), + (!cast<Instruction>(prefix # "_4S") VPR128:$Rn, 0)>; +} + +// Shift left long +defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>; +defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>; + +// Rounding/Saturating shift +class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDPatternOperator OpNode> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # T # ", $Rn." 
# T # ", $Imm", + [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn), + (i32 ImmTy:$Imm))))], + NoItinerary>; + +// shift right (vector by immediate) +multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop, + SDPatternOperator OpNode> { + def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +multiclass NeonI_N2VShL_Q<bit u, bits<5> opcode, string asmop, + SDPatternOperator OpNode> { + // 64-bit vector types. + def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + // 128-bit vector types. + def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Rounding shift right +defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr", + int_aarch64_neon_vsrshr>; +defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr", + int_aarch64_neon_vurshr>; + +// Saturating shift left unsigned +defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>; + +// Saturating shift left +defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>; +defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>; + +class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDNode OpNode> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # T # ", $Rn." 
# T # ", $Imm", + [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src), + (Ty (OpNode (Ty VPRC:$Rn), + (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +// Shift Right accumulate +multiclass NeonI_N2VShRAdd<bit u, bits<5> opcode, string asmop, SDNode OpNode> { + def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Shift right and accumulate +defm SSRAvvi : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>; +defm USRAvvi : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>; + +// Rounding shift accumulate +class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDPatternOperator OpNode> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm", + [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src), + (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_N2VShRAdd_R<bit u, bits<5> opcode, string asmop, + SDPatternOperator OpNode> { + def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Rounding shift right and accumulate +defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>; +defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>; + +// Shift insert by immediate +class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDPatternOperator OpNode> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # T # ", $Rn." 
# T # ", $Imm", + [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn), + (i32 ImmTy:$Imm))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +// shift left insert (vector by immediate) +multiclass NeonI_N2VShLIns<bit u, bits<5> opcode, string asmop> { + def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, + int_aarch64_neon_vsli> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, + int_aarch64_neon_vsli> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, + int_aarch64_neon_vsli> { + let Inst{22-21} = 0b01; + } + + // 128-bit vector types + def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, + int_aarch64_neon_vsli> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, + int_aarch64_neon_vsli> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, + int_aarch64_neon_vsli> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, + int_aarch64_neon_vsli> { + let Inst{22} = 0b1; + } +} + +// shift right insert (vector by immediate) +multiclass NeonI_N2VShRIns<bit u, bits<5> opcode, string asmop> { + // 64-bit vector types. + def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + int_aarch64_neon_vsri> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + int_aarch64_neon_vsri> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + int_aarch64_neon_vsri> { + let Inst{22-21} = 0b01; + } + + // 128-bit vector types + def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + int_aarch64_neon_vsri> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + int_aarch64_neon_vsri> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + int_aarch64_neon_vsri> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + int_aarch64_neon_vsri> { + let Inst{22} = 0b1; + } +} + +// Shift left and insert +defm SLIvvi : NeonI_N2VShLIns<0b1, 0b01010, "sli">; + +// Shift right and insert +defm SRIvvi : NeonI_N2VShRIns<0b1, 0b01000, "sri">; + +class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT, + string SrcT, Operand ImmTy> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm", + [], NoItinerary>; + +class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT, + string SrcT, Operand ImmTy> + : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd), + (ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # DestT # ", $Rn." 
# SrcT # ", $Imm", + [], NoItinerary> { + let Constraints = "$src = $Rd"; +} + +// left long shift by immediate +multiclass NeonI_N2VShR_Narrow<bit u, bits<5> opcode, string asmop> { + def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> { + let Inst{22-21} = 0b01; + } + + // Shift Narrow High + def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h", + shr_imm8> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s", + shr_imm16> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d", + shr_imm32> { + let Inst{22-21} = 0b01; + } +} + +// Shift right narrow +defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">; + +// Shift right narrow (prefix Q is saturating, prefix R is rounding) +defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">; +defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">; +defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">; +defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">; +defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">; +defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">; +defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">; + +def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn), + (v2i64 (concat_vectors (v1i64 node:$Rm), + (v1i64 node:$Rn)))>; +def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn), + (v8i16 (concat_vectors (v4i16 node:$Rm), + (v4i16 node:$Rn)))>; +def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn), + (v4i32 (concat_vectors (v2i32 node:$Rm), + (v2i32 node:$Rn)))>; +def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn), + (v4f32 (concat_vectors (v2f32 node:$Rm), + (v2f32 node:$Rn)))>; +def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn), + (v2f64 (concat_vectors (v1f64 node:$Rm), + (v1f64 node:$Rn)))>; + +def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), + (v8i16 (srl (v8i16 node:$lhs), + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs), + (v4i32 (srl (v4i32 node:$lhs), + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs), + (v2i64 (srl (v2i64 node:$lhs), + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs), + (v8i16 (sra (v8i16 node:$lhs), + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs), + (v4i32 (sra (v4i32 node:$lhs), + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs), + (v2i64 (sra (v2i64 node:$lhs), + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; + +// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors) +multiclass Neon_shiftNarrow_patterns<string shr> { + def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn, + (i32 shr_imm8:$Imm)))), + (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>; + def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn, + (i32 shr_imm16:$Imm)))), + (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>; + def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn, + (i32 shr_imm32:$Imm)))), + (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; + + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert + (v8i8 (trunc 
(!cast<PatFrag>("Neon_" # shr # "Imm8H") + VPR128:$Rn, (i32 shr_imm8:$Imm))))))), + (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert + (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") + VPR128:$Rn, (i32 shr_imm16:$Imm))))))), + (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert + (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") + VPR128:$Rn, (i32 shr_imm32:$Imm))))))), + (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; +} + +multiclass Neon_shiftNarrow_QR_patterns<SDPatternOperator op, string prefix> { + def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)), + (!cast<Instruction>(prefix # "_8B") VPR128:$Rn, imm:$Imm)>; + def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)), + (!cast<Instruction>(prefix # "_4H") VPR128:$Rn, imm:$Imm)>; + def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)), + (!cast<Instruction>(prefix # "_2S") VPR128:$Rn, imm:$Imm)>; + + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (v8i8 + (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))), + (!cast<Instruction>(prefix # "_16B") + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (v4i16 + (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))), + (!cast<Instruction>(prefix # "_8H") + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (v2i32 + (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))), + (!cast<Instruction>(prefix # "_4S") + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; +} + +defm : Neon_shiftNarrow_patterns<"lshr">; +defm : Neon_shiftNarrow_patterns<"ashr">; + +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrun, "QSHRUNvvi">; +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vrshrn, "RSHRNvvi">; +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrun, "QRSHRUNvvi">; +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrn, "SQSHRNvvi">; +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqshrn, "UQSHRNvvi">; +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrn, "SQRSHRNvvi">; +defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqrshrn, "UQRSHRNvvi">; + +// Convert fix-point and float-pointing +class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T, + RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy, + Operand ImmTy, SDPatternOperator IntOp> + : NeonI_2VShiftImm<q, u, opcode, + (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm), + asmop # "\t$Rd." # T # ", $Rn." 
# T # ", $Imm", + [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn), + (i32 ImmTy:$Imm))))], + NoItinerary>; + +multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop, + SDPatternOperator IntOp> { + def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64, + shr_imm64, IntOp> { + let Inst{22} = 0b1; + } +} + +multiclass NeonI_N2VCvt_Fp2fx<bit u, bits<5> opcode, string asmop, + SDPatternOperator IntOp> { + def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64, + shr_imm64, IntOp> { + let Inst{22} = 0b1; + } +} + +// Convert fixed-point to floating-point +defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf", + int_arm_neon_vcvtfxs2fp>; +defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf", + int_arm_neon_vcvtfxu2fp>; + +// Convert floating-point to fixed-point +defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs", + int_arm_neon_vcvtfp2fxs>; +defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu", + int_arm_neon_vcvtfp2fxu>; + +multiclass Neon_sshll2_0<SDNode ext> +{ + def _v8i8 : PatFrag<(ops node:$Rn), + (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>; + def _v4i16 : PatFrag<(ops node:$Rn), + (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>; + def _v2i32 : PatFrag<(ops node:$Rn), + (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>; +} + +defm NI_sext_high : Neon_sshll2_0<sext>; +defm NI_zext_high : Neon_sshll2_0<zext>; + + +//===----------------------------------------------------------------------===// +// Multiclasses for NeonI_Across +//===----------------------------------------------------------------------===// + +// Variant 1 + +multiclass NeonI_2VAcross_1<bit u, bits<5> opcode, + string asmop, SDPatternOperator opnode> +{ + def _1h8b: NeonI_2VAcross<0b0, u, 0b00, opcode, + (outs FPR16:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.8b", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode, + (outs FPR16:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.16b", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode, + (outs FPR32:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.4h", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode, + (outs FPR32:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.8h", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v8i16 VPR128:$Rn))))], + NoItinerary>; + + // _1d2s doesn't exist! 
+ + def _1d4s: NeonI_2VAcross<0b1, u, 0b10, opcode, + (outs FPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.4s", + [(set (v1i64 FPR64:$Rd), + (v1i64 (opnode (v4i32 VPR128:$Rn))))], + NoItinerary>; +} + +defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>; +defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>; + +// Variant 2 + +multiclass NeonI_2VAcross_2<bit u, bits<5> opcode, + string asmop, SDPatternOperator opnode> +{ + def _1b8b: NeonI_2VAcross<0b0, u, 0b00, opcode, + (outs FPR8:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.8b", + [(set (v1i8 FPR8:$Rd), + (v1i8 (opnode (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode, + (outs FPR8:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.16b", + [(set (v1i8 FPR8:$Rd), + (v1i8 (opnode (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode, + (outs FPR16:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.4h", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode, + (outs FPR16:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.8h", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v8i16 VPR128:$Rn))))], + NoItinerary>; + + // _1s2s doesn't exist! + + def _1s4s: NeonI_2VAcross<0b1, u, 0b10, opcode, + (outs FPR32:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.4s", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v4i32 VPR128:$Rn))))], + NoItinerary>; +} + +defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>; +defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>; + +defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>; +defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>; + +defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>; + +// Variant 3 + +multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size, + string asmop, SDPatternOperator opnode> { + def _1s4s: NeonI_2VAcross<0b1, u, size, opcode, + (outs FPR32:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.4s", + [(set (v1f32 FPR32:$Rd), + (v1f32 (opnode (v4f32 VPR128:$Rn))))], + NoItinerary>; +} + +defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv", + int_aarch64_neon_vmaxnmv>; +defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv", + int_aarch64_neon_vminnmv>; + +defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv", + int_aarch64_neon_vmaxv>; +defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv", + int_aarch64_neon_vminv>; + +// The followings are for instruction class (Perm) + +class NeonI_Permute<bit q, bits<2> size, bits<3> opcode, + string asmop, RegisterOperand OpVPR, string OpS, + SDPatternOperator opnode, ValueType Ty> + : NeonI_Perm<q, size, opcode, + (outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (Ty OpVPR:$Rd), + (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))], + NoItinerary>; + +multiclass NeonI_Perm_pat<bits<3> opcode, string asmop, + SDPatternOperator opnode> { + def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, + VPR64, "8b", opnode, v8i8>; + def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, + VPR128, "16b",opnode, v16i8>; + def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, + VPR64, "4h", opnode, v4i16>; + def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, + VPR128, "8h", opnode, v8i16>; + def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, + VPR64, "2s", opnode, v2i32>; + def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, + VPR128, "4s", opnode, v4i32>; + def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, + VPR128, "2d", opnode, v2i64>; +} + +defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>; +defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>; +defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>; +defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>; +defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>; +defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>; + +multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> { + def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>; + + def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>; + + def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>; +} + +defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>; +defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>; +defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>; +defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>; +defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>; +defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>; + +// The followings are for instruction class (3V Diff) + +// normal long/long2 pattern +class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator ext, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))), + (ResTy (ext (OpTy OpVPR:$Rm))))))], + NoItinerary>; + +multiclass NeonI_3VDL_s<bit u, bits<4> opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, sext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, sext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, sext, VPR64, v2i64, v2i32>; + } +} + +multiclass NeonI_3VDL2_s<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; + } +} + +multiclass NeonI_3VDL_u<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, zext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, zext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, zext, VPR64, v2i64, v2i32>; + } +} + +multiclass NeonI_3VDL2_u<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; + } +} + +defm SADDLvvv : NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>; +defm UADDLvvv : NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>; + +defm SADDL2vvv : NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>; +defm UADDL2vvv : NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>; + +defm SSUBLvvv : NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>; +defm USUBLvvv : NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>; + +defm SSUBL2vvv : NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>; +defm USUBL2vvv : NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>; + +// normal wide/wide2 pattern +class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator ext, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # ResS # ", $Rm." 
# OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (opnode (ResTy VPR128:$Rn), + (ResTy (ext (OpTy OpVPR:$Rm))))))], + NoItinerary>; + +multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, sext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, sext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, sext, VPR64, v2i64, v2i32>; +} + +defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>; +defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>; + +multiclass NeonI_3VDW2_s<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; +} + +defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>; +defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>; + +multiclass NeonI_3VDW_u<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, zext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, zext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, zext, VPR64, v2i64, v2i32>; +} + +defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>; +defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>; + +multiclass NeonI_3VDW2_u<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; +} + +defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>; +defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>; + +// Get the high half part of the vector element. +multiclass NeonI_get_high { + def _8h : PatFrag<(ops node:$Rn), + (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn), + (v8i16 (Neon_vdup (i32 8)))))))>; + def _4s : PatFrag<(ops node:$Rn), + (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn), + (v4i32 (Neon_vdup (i32 16)))))))>; + def _2d : PatFrag<(ops node:$Rn), + (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn), + (v2i64 (Neon_vdup (i32 32)))))))>; +} + +defm NI_get_hi : NeonI_get_high; + +// pattern for addhn/subhn with 2 operands +class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator get_hi, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (ResTy VPR64:$Rd), + (ResTy (get_hi + (OpTy (opnode (OpTy VPR128:$Rn), + (OpTy VPR128:$Rm))))))], + NoItinerary>; + +multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", + opnode, NI_get_hi_8h, v8i8, v8i16>; + def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", + opnode, NI_get_hi_4s, v4i16, v4i32>; + def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d", + opnode, NI_get_hi_2d, v2i32, v2i64>; + } +} + +defm ADDHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>; +defm SUBHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>; + +// pattern for operation with 2 operands +class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + RegisterOperand ResVPR, RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs ResVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS, + [(set (ResTy ResVPR:$Rd), + (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))], + NoItinerary>; + +// normal narrow pattern +multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", + opnode, VPR64, VPR128, v8i8, v8i16>; + def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", + opnode, VPR64, VPR128, v4i16, v4i32>; + def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d", + opnode, VPR64, VPR128, v2i32, v2i64>; + } +} + +defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>; +defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>; + +// pattern for acle intrinsic with 3 operands +class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS, + [], NoItinerary> { + let Constraints = "$src = $Rd"; + let neverHasSideEffects = 1; +} + +multiclass NeonI_3VDN_3Op_v1<bit u, bits<4> opcode, string asmop> { + def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">; + def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">; + def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">; +} + +defm ADDHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">; +defm SUBHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">; + +defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">; +defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">; + +// Patterns have to be separate because there's a SUBREG_TO_REG in the output +// part. 
+class NarrowHighHalfPat<Instruction INST, ValueType DstTy, ValueType SrcTy, + SDPatternOperator coreop> + : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn), + (SrcTy VPR128:$Rm)))))), + (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, VPR128:$Rm)>; + +// addhn2 patterns +def : NarrowHighHalfPat<ADDHN2vvv_16b8h, v8i8, v8i16, + BinOpFrag<(NI_get_hi_8h (add node:$LHS, node:$RHS))>>; +def : NarrowHighHalfPat<ADDHN2vvv_8h4s, v4i16, v4i32, + BinOpFrag<(NI_get_hi_4s (add node:$LHS, node:$RHS))>>; +def : NarrowHighHalfPat<ADDHN2vvv_4s2d, v2i32, v2i64, + BinOpFrag<(NI_get_hi_2d (add node:$LHS, node:$RHS))>>; + +// subhn2 patterns +def : NarrowHighHalfPat<SUBHN2vvv_16b8h, v8i8, v8i16, + BinOpFrag<(NI_get_hi_8h (sub node:$LHS, node:$RHS))>>; +def : NarrowHighHalfPat<SUBHN2vvv_8h4s, v4i16, v4i32, + BinOpFrag<(NI_get_hi_4s (sub node:$LHS, node:$RHS))>>; +def : NarrowHighHalfPat<SUBHN2vvv_4s2d, v2i32, v2i64, + BinOpFrag<(NI_get_hi_2d (sub node:$LHS, node:$RHS))>>; + +// raddhn2 patterns +def : NarrowHighHalfPat<RADDHN2vvv_16b8h, v8i8, v8i16, int_arm_neon_vraddhn>; +def : NarrowHighHalfPat<RADDHN2vvv_8h4s, v4i16, v4i32, int_arm_neon_vraddhn>; +def : NarrowHighHalfPat<RADDHN2vvv_4s2d, v2i32, v2i64, int_arm_neon_vraddhn>; + +// rsubhn2 patterns +def : NarrowHighHalfPat<RSUBHN2vvv_16b8h, v8i8, v8i16, int_arm_neon_vrsubhn>; +def : NarrowHighHalfPat<RSUBHN2vvv_8h4s, v4i16, v4i32, int_arm_neon_vrsubhn>; +def : NarrowHighHalfPat<RSUBHN2vvv_4s2d, v2i32, v2i64, int_arm_neon_vrsubhn>; + +// pattern that need to extend result +class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, ValueType OpSTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn), + (OpTy OpVPR:$Rm))))))], + NoItinerary>; + +multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, VPR64, v8i16, v8i8, v8i8>; + def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, VPR64, v4i32, v4i16, v4i16>; + def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, VPR64, v2i64, v2i32, v2i32>; + } +} + +defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>; +defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>; + +multiclass NeonI_Op_High<SDPatternOperator op> { + def _16B : PatFrag<(ops node:$Rn, node:$Rm), + (op (v8i8 (Neon_High16B node:$Rn)), + (v8i8 (Neon_High16B node:$Rm)))>; + def _8H : PatFrag<(ops node:$Rn, node:$Rm), + (op (v4i16 (Neon_High8H node:$Rn)), + (v4i16 (Neon_High8H node:$Rm)))>; + def _4S : PatFrag<(ops node:$Rn, node:$Rm), + (op (v2i32 (Neon_High4S node:$Rn)), + (v2i32 (Neon_High4S node:$Rm)))>; +} + +defm NI_sabdl_hi : NeonI_Op_High<int_arm_neon_vabds>; +defm NI_uabdl_hi : NeonI_Op_High<int_arm_neon_vabdu>; +defm NI_smull_hi : NeonI_Op_High<int_arm_neon_vmulls>; +defm NI_umull_hi : NeonI_Op_High<int_arm_neon_vmullu>; +defm NI_qdmull_hi : NeonI_Op_High<int_arm_neon_vqdmull>; +defm NI_pmull_hi : NeonI_Op_High<int_arm_neon_vmullp>; + +multiclass NeonI_3VDL_Abd_u<bit u, bits<4> opcode, string asmop, string opnode, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b", + !cast<PatFrag>(opnode # "_16B"), + VPR128, v8i16, v16i8, v8i8>; + def _4s4h : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h", + !cast<PatFrag>(opnode # "_8H"), + VPR128, v4i32, v8i16, v4i16>; + def _2d2s : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s", + !cast<PatFrag>(opnode # "_4S"), + VPR128, v2i64, v4i32, v2i32>; + } +} + +defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>; +defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>; + +// For pattern that need two operators being chained. +class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator subop, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, ValueType OpSTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (opnode + (ResTy VPR128:$src), + (ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn), + (OpTy OpVPR:$Rm))))))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_3VDL_Aba_v1<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, SDPatternOperator subop>{ + def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, subop, VPR64, v8i16, v8i8, v8i8>; + def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, subop, VPR64, v4i32, v4i16, v4i16>; + def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, subop, VPR64, v2i64, v2i32, v2i32>; +} + +defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal", + add, int_arm_neon_vabds>; +defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal", + add, int_arm_neon_vabdu>; + +multiclass NeonI_3VDL2_Aba_v1<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, string subop> { + def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, !cast<PatFrag>(subop # "_16B"), + VPR128, v8i16, v16i8, v8i8>; + def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, !cast<PatFrag>(subop # "_8H"), + VPR128, v4i32, v8i16, v4i16>; + def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, !cast<PatFrag>(subop # "_4S"), + VPR128, v2i64, v4i32, v2i32>; +} + +defm SABAL2vvv : NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add, + "NI_sabdl_hi">; +defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add, + "NI_uabdl_hi">; + +// Long pattern with 2 operands +multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, VPR128, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, VPR128, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, VPR128, VPR64, v2i64, v2i32>; + } +} + +defm SMULLvvv : NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>; +defm UMULLvvv : NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>; + +class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))], + NoItinerary>; + +multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop, + string opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", + !cast<PatFrag>(opnode # "_16B"), + v8i16, v16i8>; + def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", + !cast<PatFrag>(opnode # "_8H"), + v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", + !cast<PatFrag>(opnode # "_4S"), + v2i64, v4i32>; + } +} + +defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2", + "NI_smull_hi", 1>; +defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2", + "NI_umull_hi", 1>; + +// Long pattern with 3 operands +class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (opnode + (ResTy VPR128:$src), + (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_3VDL_3Op_v1<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, v8i16, v8i8>; + def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, v4i32, v4i16>; + def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, v2i64, v2i32>; +} + +def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (add node:$Rd, + (int_arm_neon_vmulls node:$Rn, node:$Rm))>; + +def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (add node:$Rd, + (int_arm_neon_vmullu node:$Rn, node:$Rm))>; + +def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (sub node:$Rd, + (int_arm_neon_vmulls node:$Rn, node:$Rm))>; + +def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (sub node:$Rd, + (int_arm_neon_vmullu node:$Rn, node:$Rm))>; + +defm SMLALvvv : NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>; +defm UMLALvvv : NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>; + +defm SMLSLvvv : NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>; +defm UMLSLvvv : NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>; + +class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator subop, SDPatternOperator opnode, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff<q, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." 
# OpS, + [(set (ResTy VPR128:$Rd), + (ResTy (subop + (ResTy VPR128:$src), + (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_3VDL2_3Op_mlas_v1<bit u, bits<4> opcode, string asmop, + SDPatternOperator subop, string opnode> { + def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b", + subop, !cast<PatFrag>(opnode # "_16B"), + VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", + subop, !cast<PatFrag>(opnode # "_8H"), + VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", + subop, !cast<PatFrag>(opnode # "_4S"), + VPR128, v2i64, v4i32>; +} + +defm SMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2", + add, "NI_smull_hi">; +defm UMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2", + add, "NI_umull_hi">; + +defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2", + sub, "NI_smull_hi">; +defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2", + sub, "NI_umull_hi">; + +multiclass NeonI_3VDL_qdmlal_3Op_v2<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, int_arm_neon_vqdmull, + VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, int_arm_neon_vqdmull, + VPR64, v2i64, v2i32>; +} + +defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal", + int_arm_neon_vqadds>; +defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl", + int_arm_neon_vqsubs>; + +multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, VPR128, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, VPR128, VPR64, v2i64, v2i32>; + } +} + +defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull", + int_arm_neon_vqdmull, 1>; + +multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop, + string opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", + !cast<PatFrag>(opnode # "_8H"), + v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", + !cast<PatFrag>(opnode # "_4S"), + v2i64, v4i32>; + } +} + +defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2", + "NI_qdmull_hi", 1>; + +multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode> { + def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_qdmull_hi_8H, + VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_qdmull_hi_4S, + VPR128, v2i64, v4i32>; +} + +defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2", + int_arm_neon_vqadds>; +defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2", + int_arm_neon_vqsubs>; + +multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, VPR128, VPR64, v8i16, v8i8>; + + def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # 
"\t$Rd.1q, $Rn.1d, $Rm.1d", + [], NoItinerary>; + } +} + +defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>; + +multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop, + string opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", + !cast<PatFrag>(opnode # "_16B"), + v8i16, v16i8>; + + def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d", + [], NoItinerary>; + } +} + +defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi", + 1>; + +// End of implementation for instruction class (3V Diff) + +// The followings are vector load/store multiple N-element structure +// (class SIMD lselem). + +// ld1: load multiple 1-element structure to 1/2/3/4 registers. +// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4). +// The structure consists of a sequence of sets of N values. +// The first element of the structure is placed in the first lane +// of the first first vector, the second element in the first lane +// of the second vector, and so on. +// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into +// the three 64-bit vectors list {BA, DC, FE}. +// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three +// 64-bit vectors list {DA, EB, FC}. +// Store instructions store multiple structure to N registers like load. + + +class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdStMult<q, 1, opcode, size, + (outs VecList:$Rt), (ins GPR64xsp:$Rn), + asmop # "\t$Rt, [$Rn]", + [], + NoItinerary> { + let mayLoad = 1; + let neverHasSideEffects = 1; +} + +multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> { + def _8B : NeonI_LDVList<0, opcode, 0b00, + !cast<RegisterOperand>(List # "8B_operand"), asmop>; + + def _4H : NeonI_LDVList<0, opcode, 0b01, + !cast<RegisterOperand>(List # "4H_operand"), asmop>; + + def _2S : NeonI_LDVList<0, opcode, 0b10, + !cast<RegisterOperand>(List # "2S_operand"), asmop>; + + def _16B : NeonI_LDVList<1, opcode, 0b00, + !cast<RegisterOperand>(List # "16B_operand"), asmop>; + + def _8H : NeonI_LDVList<1, opcode, 0b01, + !cast<RegisterOperand>(List # "8H_operand"), asmop>; + + def _4S : NeonI_LDVList<1, opcode, 0b10, + !cast<RegisterOperand>(List # "4S_operand"), asmop>; + + def _2D : NeonI_LDVList<1, opcode, 0b11, + !cast<RegisterOperand>(List # "2D_operand"), asmop>; +} + +// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4) +defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; +def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; + +defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; + +defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + +defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; + +// Load multiple 1-element structure to N consecutive registers (N = 2,3,4) +defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; +def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; + +defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">; +def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; + +defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">; +def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; + +class NeonI_STVList<bit q, bits<4> opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdStMult<q, 0, opcode, size, + (outs), 
(ins GPR64xsp:$Rn, VecList:$Rt), + asmop # "\t$Rt, [$Rn]", + [], + NoItinerary> { + let mayStore = 1; + let neverHasSideEffects = 1; +} + +multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> { + def _8B : NeonI_STVList<0, opcode, 0b00, + !cast<RegisterOperand>(List # "8B_operand"), asmop>; + + def _4H : NeonI_STVList<0, opcode, 0b01, + !cast<RegisterOperand>(List # "4H_operand"), asmop>; + + def _2S : NeonI_STVList<0, opcode, 0b10, + !cast<RegisterOperand>(List # "2S_operand"), asmop>; + + def _16B : NeonI_STVList<1, opcode, 0b00, + !cast<RegisterOperand>(List # "16B_operand"), asmop>; + + def _8H : NeonI_STVList<1, opcode, 0b01, + !cast<RegisterOperand>(List # "8H_operand"), asmop>; + + def _4S : NeonI_STVList<1, opcode, 0b10, + !cast<RegisterOperand>(List # "4S_operand"), asmop>; + + def _2D : NeonI_STVList<1, opcode, 0b11, + !cast<RegisterOperand>(List # "2D_operand"), asmop>; +} + +// Store multiple N-element structures from N registers (N = 1,2,3,4) +defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; +def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; + +defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; + +defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; + +defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; + +// Store multiple 1-element structures from N consecutive registers (N = 2,3,4) +defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; +def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; + +defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; +def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; + +defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; +def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; + +def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; +def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; + +def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; +def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; + +def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; +def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; + +def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; +def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + +def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; +def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + +def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; +def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; + +def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), + (ST1_8H GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), + (ST1_16B GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + +def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, 
VPR64:$value)>; + +def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), + (ST1_4H GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), + (ST1_8B GPR64xsp:$addr, VPR64:$value)>; + +// End of vector load/store multiple N-element structure (class SIMD lselem) + +// The following are post-index vector load/store multiple N-element +// structure (class SIMD lselem-post) +def exact1_asmoperand : AsmOperandClass { + let Name = "Exact1"; + let PredicateMethod = "isExactImm<1>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> { + let ParserMatchClass = exact1_asmoperand; +} + +def exact2_asmoperand : AsmOperandClass { + let Name = "Exact2"; + let PredicateMethod = "isExactImm<2>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> { + let ParserMatchClass = exact2_asmoperand; +} + +def exact3_asmoperand : AsmOperandClass { + let Name = "Exact3"; + let PredicateMethod = "isExactImm<3>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> { + let ParserMatchClass = exact3_asmoperand; +} + +def exact4_asmoperand : AsmOperandClass { + let Name = "Exact4"; + let PredicateMethod = "isExactImm<4>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> { + let ParserMatchClass = exact4_asmoperand; +} + +def exact6_asmoperand : AsmOperandClass { + let Name = "Exact6"; + let PredicateMethod = "isExactImm<6>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> { + let ParserMatchClass = exact6_asmoperand; +} + +def exact8_asmoperand : AsmOperandClass { + let Name = "Exact8"; + let PredicateMethod = "isExactImm<8>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> { + let ParserMatchClass = exact8_asmoperand; +} + +def exact12_asmoperand : AsmOperandClass { + let Name = "Exact12"; + let PredicateMethod = "isExactImm<12>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> { + let ParserMatchClass = exact12_asmoperand; +} + +def exact16_asmoperand : AsmOperandClass { + let Name = "Exact16"; + let PredicateMethod = "isExactImm<16>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact16 : Operand<i32>, ImmLeaf<i32, [{return Imm == 16;}]> { + let ParserMatchClass = exact16_asmoperand; +} + +def exact24_asmoperand : AsmOperandClass { + let Name = "Exact24"; + let PredicateMethod = "isExactImm<24>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact24 : Operand<i32>, ImmLeaf<i32, [{return Imm == 24;}]> { + let ParserMatchClass = exact24_asmoperand; +} + +def exact32_asmoperand : AsmOperandClass { + let Name = "Exact32"; + let PredicateMethod = "isExactImm<32>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact32 : Operand<i32>, ImmLeaf<i32, [{return Imm == 32;}]> { + let ParserMatchClass = exact32_asmoperand; +} + +def exact48_asmoperand : AsmOperandClass { + let Name = "Exact48"; + let PredicateMethod = "isExactImm<48>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact48 : Operand<i32>, ImmLeaf<i32, [{return Imm == 48;}]> { + let ParserMatchClass = exact48_asmoperand; +} + +def exact64_asmoperand : AsmOperandClass { + let Name = "Exact64"; + let PredicateMethod = "isExactImm<64>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact64 : 
Operand<i32>, ImmLeaf<i32, [{return Imm == 64;}]> { + let ParserMatchClass = exact64_asmoperand; +} + +multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1, + DecoderMethod = "DecodeVLDSTPostInstruction" in { + def _fixed : NeonI_LdStMult_Post<q, 1, opcode, size, + (outs VecList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt), + asmop # "\t$Rt, [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + def _register : NeonI_LdStMult_Post<q, 1, opcode, size, + (outs VecList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm), + asmop # "\t$Rt, [$Rn], $Rm", + [], + NoItinerary>; + } +} + +multiclass LDWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy, + Operand ImmTy2, string asmop> { + defm _8B : NeonI_LDWB_VList<0, opcode, 0b00, + !cast<RegisterOperand>(List # "8B_operand"), + ImmTy, asmop>; + + defm _4H : NeonI_LDWB_VList<0, opcode, 0b01, + !cast<RegisterOperand>(List # "4H_operand"), + ImmTy, asmop>; + + defm _2S : NeonI_LDWB_VList<0, opcode, 0b10, + !cast<RegisterOperand>(List # "2S_operand"), + ImmTy, asmop>; + + defm _16B : NeonI_LDWB_VList<1, opcode, 0b00, + !cast<RegisterOperand>(List # "16B_operand"), + ImmTy2, asmop>; + + defm _8H : NeonI_LDWB_VList<1, opcode, 0b01, + !cast<RegisterOperand>(List # "8H_operand"), + ImmTy2, asmop>; + + defm _4S : NeonI_LDWB_VList<1, opcode, 0b10, + !cast<RegisterOperand>(List # "4S_operand"), + ImmTy2, asmop>; + + defm _2D : NeonI_LDWB_VList<1, opcode, 0b11, + !cast<RegisterOperand>(List # "2D_operand"), + ImmTy2, asmop>; +} + +// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) +defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; +defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "ld1">; + +defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; + +defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "ld3">; + +defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; + +// Post-index load multiple 1-element structures from N consecutive registers +// (N = 2,3,4) +defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "ld1">; +defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "ld1">; + +defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "ld1">; +defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "ld1">; + +defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "ld1">; +defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "ld1">; + +multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1, + DecoderMethod = "DecodeVLDSTPostInstruction" in { + def _fixed : NeonI_LdStMult_Post<q, 0, opcode, size, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt), + asmop # "\t$Rt, [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + def _register : NeonI_LdStMult_Post<q, 0, opcode, size, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt), + asmop # "\t$Rt, [$Rn], $Rm", + [], + NoItinerary>; + } +} + +multiclass STWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy, + Operand 
ImmTy2, string asmop> { + defm _8B : NeonI_STWB_VList<0, opcode, 0b00, + !cast<RegisterOperand>(List # "8B_operand"), ImmTy, asmop>; + + defm _4H : NeonI_STWB_VList<0, opcode, 0b01, + !cast<RegisterOperand>(List # "4H_operand"), + ImmTy, asmop>; + + defm _2S : NeonI_STWB_VList<0, opcode, 0b10, + !cast<RegisterOperand>(List # "2S_operand"), + ImmTy, asmop>; + + defm _16B : NeonI_STWB_VList<1, opcode, 0b00, + !cast<RegisterOperand>(List # "16B_operand"), + ImmTy2, asmop>; + + defm _8H : NeonI_STWB_VList<1, opcode, 0b01, + !cast<RegisterOperand>(List # "8H_operand"), + ImmTy2, asmop>; + + defm _4S : NeonI_STWB_VList<1, opcode, 0b10, + !cast<RegisterOperand>(List # "4S_operand"), + ImmTy2, asmop>; + + defm _2D : NeonI_STWB_VList<1, opcode, 0b11, + !cast<RegisterOperand>(List # "2D_operand"), + ImmTy2, asmop>; +} + +// Post-index store multiple N-element structures from N registers (N = 1,2,3,4) +defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; +defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "st1">; + +defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; + +defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "st3">; + +defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; + +// Post-index store multiple 1-element structures from N consecutive registers +// (N = 2,3,4) +defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "st1">; +defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "st1">; + +defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "st1">; +defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "st1">; + +defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "st1">; +defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "st1">; + +// End of post-index vector load/store multiple N-element structure +// (class SIMD lselem-post) + +// The following are vector load/store single N-element structure +// (class SIMD lsone). 
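As a quick orientation for the "lsone" class below: it covers loads and stores of a single structure, either replicated to every lane (ld1r/ld2r/ld3r/ld4r) or touching one lane of one to four registers (the lane forms of ld1..ld4/st1..st4). The C sketch below, using ACLE intrinsics from <arm_neon.h>, shows the source-level shapes that the LD1R_pattern, LD1LN_patterns and ST1LN_patterns defined further down are intended to match; lsone_demo is an invented name, and whether the compiler really selects the ld1r / ld1 (lane) / st1 (lane) encodings is up to instruction selection rather than guaranteed by these definitions.

  /* Hedged sketch: single-structure NEON loads/stores via ACLE intrinsics.
     Build for AArch64 with any toolchain that provides <arm_neon.h>. */
  #include <arm_neon.h>

  void lsone_demo(const float *p, float *q) {
    /* Load one float and broadcast it to all four lanes: the
       Neon_vdup-of-load shape matched by LD1R_pattern (LD1R_4S). */
    float32x4_t dup = vld1q_dup_f32(p);

    /* Load one float into lane 2 of an existing vector: the
       vector_insert-of-load shape matched by LD1LN_patterns (LD1LN_S). */
    float32x4_t v = vdupq_n_f32(0.0f);
    v = vld1q_lane_f32(p + 1, v, 2);

    /* Store lane 3 back to memory: the store-of-vector_extract shape
       matched by ST1LN_patterns (ST1LN_S). */
    vst1q_lane_f32(q, vaddq_f32(dup, v), 3);
  }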
+def neon_uimm0_bare : Operand<i64>, + ImmLeaf<i64, [{return Imm == 0;}]> { + let ParserMatchClass = neon_uimm0_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm1_bare : Operand<i64>, + ImmLeaf<i64, [{return Imm < 2;}]> { + let ParserMatchClass = neon_uimm1_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm2_bare : Operand<i64>, + ImmLeaf<i64, [{return Imm < 4;}]> { + let ParserMatchClass = neon_uimm2_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm3_bare : Operand<i64>, + ImmLeaf<i64, [{return Imm < 8;}]> { + let ParserMatchClass = uimm3_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm4_bare : Operand<i64>, + ImmLeaf<i64, [{return Imm < 16;}]> { + let ParserMatchClass = uimm4_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdOne_Dup<q, r, opcode, size, + (outs VecList:$Rt), (ins GPR64xsp:$Rn), + asmop # "\t$Rt, [$Rn]", + [], + NoItinerary> { + let mayLoad = 1; + let neverHasSideEffects = 1; +} + +multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> { + def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00, + !cast<RegisterOperand>(List # "8B_operand"), asmop>; + + def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01, + !cast<RegisterOperand>(List # "4H_operand"), asmop>; + + def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10, + !cast<RegisterOperand>(List # "2S_operand"), asmop>; + + def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11, + !cast<RegisterOperand>(List # "1D_operand"), asmop>; + + def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00, + !cast<RegisterOperand>(List # "16B_operand"), asmop>; + + def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01, + !cast<RegisterOperand>(List # "8H_operand"), asmop>; + + def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10, + !cast<RegisterOperand>(List # "4S_operand"), asmop>; + + def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11, + !cast<RegisterOperand>(List # "2D_operand"), asmop>; +} + +// Load single 1-element structure to all lanes of 1 register +defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; + +// Load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; +defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; +defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; + + +class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp, + Instruction INST> + : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))), + (VTy (INST GPR64xsp:$Rn))>; + +// Match all LD1R instructions +def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>; + +def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>; + +def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>; + +def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>; + +def : LD1R_pattern<v2i32, i32, load, LD1R_2S>; +def : LD1R_pattern<v2f32, f32, load, LD1R_2S>; + +def : LD1R_pattern<v4i32, i32, load, LD1R_4S>; +def : LD1R_pattern<v4f32, f32, load, LD1R_4S>; + +def : LD1R_pattern<v1i64, i64, load, LD1R_1D>; +def : LD1R_pattern<v1f64, f64, load, LD1R_1D>; + +def : LD1R_pattern<v2i64, i64, load, LD1R_2D>; +def : LD1R_pattern<v2f64, f64, load, LD1R_2D>; + + +multiclass VectorList_Bare_BHSD<string PREFIX, int Count, + RegisterClass RegList> { + defm B : VectorList_operands<PREFIX, "B", Count, RegList>; + defm H : VectorList_operands<PREFIX, "H", Count, RegList>; + defm S : VectorList_operands<PREFIX, "S", Count, RegList>; + defm D : 
VectorList_operands<PREFIX, "D", Count, RegList>; +} + +// Special vector list operand of 128-bit vectors with bare layout. +// i.e. only show ".b", ".h", ".s", ".d" +defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>; +defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>; +defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>; +defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>; + +class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<1, r, op2_1, op0, + (outs VList:$Rt), + (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayLoad = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; + let Constraints = "$src = $Rt"; +} + +multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> { + def _B : NeonI_LDN_Lane<r, 0b00, op0, + !cast<RegisterOperand>(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H : NeonI_LDN_Lane<r, 0b01, op0, + !cast<RegisterOperand>(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S : NeonI_LDN_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D : NeonI_LDN_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "D_operand"), + neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Load single 1-element structure to one lane of 1 register. +defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; + +// Load single N-element structure to one lane of N consecutive registers +// (N = 2,3,4) +defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; +defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; +defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; + +multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy, + Operand ImmOp, Operand ImmOp2, PatFrag LoadOp, + Instruction INST> { + def : Pat<(VTy (vector_insert (VTy VPR64:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), + (VTy (EXTRACT_SUBREG + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + ImmOp:$lane), + sub_64))>; + + def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), + (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; +} + +// Match all LD1LN instructions +defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare, + extloadi8, LD1LN_B>; + +defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare, + extloadi16, LD1LN_H>; + +defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare, + load, LD1LN_S>; +defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare, + load, LD1LN_S>; + +defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare, + load, LD1LN_D>; +defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare, + load, LD1LN_D>; + +class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<0, r, op2_1, op0, + (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayStore = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; +} + +multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> { + def _B : 
NeonI_STN_Lane<r, 0b00, op0, + !cast<RegisterOperand>(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H : NeonI_STN_Lane<r, 0b01, op0, + !cast<RegisterOperand>(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S : NeonI_STN_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D : NeonI_STN_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "D_operand"), + neon_uimm1_bare, asmop>{ + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Store single 1-element structure from one lane of 1 register. +defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; + +// Store single N-element structure from one lane of N consecutive registers +// (N = 2,3,4) +defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; +defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; +defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; + +multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy, + Operand ImmOp, Operand ImmOp2, PatFrag StoreOp, + Instruction INST> { + def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64), + ImmOp:$lane)>; + + def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>; +} + +// Match all ST1LN instructions +defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare, + truncstorei8, ST1LN_B>; + +defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare, + truncstorei16, ST1LN_H>; + +defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare, + store, ST1LN_S>; +defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare, + store, ST1LN_S>; + +defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare, + store, ST1LN_D>; +defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare, + store, ST1LN_D>; + +// End of vector load/store single N-element structure (class SIMD lsone). 
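One detail worth calling out before the post-index ("lsone-post") definitions that follow, and which equally applies to the "lselem-post" forms above: a post-indexed structure access always advances the base register by exactly the number of bytes transferred, which is why the write-back immediates are modelled with the single-valued uimm_exactN operands rather than a general immediate. The C sketch below (scale_interleaved is an invented name) shows the kind of loop where LD2/ST2 with post-indexed addressing, e.g. "ld2 {v0.4s, v1.4s}, [x0], #32", would be attractive; whether the compiler actually picks the post-indexed encoding is an instruction-selection decision, not something these definitions promise.

  /* Hedged sketch: de-interleave, scale, and re-interleave pairs of floats.
     vld2q_f32/vst2q_f32 correspond to the LD2/ST2 structure instructions;
     each iteration touches 4 pairs = 32 bytes, i.e. the uimm_exact32 amount. */
  #include <arm_neon.h>

  void scale_interleaved(float *buf, float s, int n_pairs) {
    for (int i = 0; i + 4 <= n_pairs; i += 4, buf += 8) {
      float32x4x2_t v = vld2q_f32(buf);     /* load 8 floats, de-interleaved */
      v.val[0] = vmulq_n_f32(v.val[0], s);  /* first element of each pair    */
      v.val[1] = vmulq_n_f32(v.val[1], s);  /* second element of each pair   */
      vst2q_f32(buf, v);                    /* re-interleave and store       */
    }
  }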
+ + +// The following are post-index load/store single N-element instructions +// (class SIMD lsone-post) + +multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size, + (outs VecList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt), + asmop # "\t$Rt, [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size, + (outs VecList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm), + asmop # "\t$Rt, [$Rn], $Rm", + [], + NoItinerary>; + } +} + +multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop, + Operand uimm_b, Operand uimm_h, + Operand uimm_s, Operand uimm_d> { + defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00, + !cast<RegisterOperand>(List # "8B_operand"), + uimm_b, asmop>; + + defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01, + !cast<RegisterOperand>(List # "4H_operand"), + uimm_h, asmop>; + + defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10, + !cast<RegisterOperand>(List # "2S_operand"), + uimm_s, asmop>; + + defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11, + !cast<RegisterOperand>(List # "1D_operand"), + uimm_d, asmop>; + + defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00, + !cast<RegisterOperand>(List # "16B_operand"), + uimm_b, asmop>; + + defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01, + !cast<RegisterOperand>(List # "8H_operand"), + uimm_h, asmop>; + + defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10, + !cast<RegisterOperand>(List # "4S_operand"), + uimm_s, asmop>; + + defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11, + !cast<RegisterOperand>(List # "2D_operand"), + uimm_d, asmop>; +} + +// Post-index load single 1-element structure to all lanes of 1 register +defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, + Constraints = "$Rn = $wb, $Rt = $src", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $Rm", + [], + NoItinerary>; +} + +multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop, + Operand uimm_b, Operand uimm_h, + Operand uimm_s, Operand uimm_d> { + def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0, + 
!cast<RegisterOperand>(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _B_register : LDN_WBReg_Lane<r, 0b00, op0, + !cast<RegisterOperand>(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0, + !cast<RegisterOperand>(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _H_register : LDN_WBReg_Lane<r, 0b01, op0, + !cast<RegisterOperand>(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _S_register : LDN_WBReg_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } + + def _D_register : LDN_WBReg_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Post-index load single 1-element structure to one lane of 1 register. +defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to one lane of N consecutive +// registers +// (N = 2,3,4) +defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayStore = 1, neverHasSideEffects = 1, + hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt, + ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $Rm", + [], + NoItinerary>; +} + +multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop, + Operand uimm_b, Operand uimm_h, + Operand uimm_s, Operand uimm_d> { + def _B_fixed : STN_WBFx_Lane<r, 0b00, op0, + !cast<RegisterOperand>(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _B_register : STN_WBReg_Lane<r, 0b00, op0, + !cast<RegisterOperand>(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H_fixed : STN_WBFx_Lane<r, 
0b01, op0, + !cast<RegisterOperand>(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _H_register : STN_WBReg_Lane<r, 0b01, op0, + !cast<RegisterOperand>(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S_fixed : STN_WBFx_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _S_register : STN_WBReg_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D_fixed : STN_WBFx_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } + + def _D_register : STN_WBReg_Lane<r, 0b10, op0, + !cast<RegisterOperand>(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Post-index store single 1-element structure from one lane of 1 register. +defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index store single N-element structure from one lane of N consecutive +// registers (N = 2,3,4) +defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +// End of post-index load/store single N-element instructions +// (class SIMD lsone-post) + +// Neon Scalar instructions implementation +// Scalar Three Same + +class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop, + RegisterClass FPRC> + : NeonI_Scalar3Same<u, size, opcode, + (outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm), + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + [], + NoItinerary>; + +class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop> + : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>; + +multiclass NeonI_Scalar3Same_HS_sizes<bit u, bits<5> opcode, string asmop, + bit Commutable = 0> { + let isCommutable = Commutable in { + def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>; + def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>; + } +} + +multiclass NeonI_Scalar3Same_SD_sizes<bit u, bit size_high, bits<5> opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def sss : NeonI_Scalar3Same_size<u, {size_high, 0b0}, opcode, asmop, FPR32>; + def ddd : NeonI_Scalar3Same_size<u, {size_high, 0b1}, opcode, asmop, FPR64>; + } +} + +multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def bbb : NeonI_Scalar3Same_size<u, 0b00, opcode, asmop, FPR8>; + def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>; + def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>; + def ddd : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>; + } +} + +multiclass Neon_Scalar3Same_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> { + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), + (INSTD FPR64:$Rn, 
FPR64:$Rm)>; +} + +multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode, + Instruction INSTB, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> + : Neon_Scalar3Same_D_size_patterns<opnode, INSTD> { + def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), + (INSTB FPR8:$Rn, FPR8:$Rm)>; + + def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + + def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; +} + +class Neon_Scalar3Same_cmp_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; + +multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS> { + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass Neon_Scalar3Same_cmp_SD_size_patterns<SDPatternOperator opnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +class Neon_Scalar3Same_cmp_V1_D_size_patterns<CondCode CC, + Instruction INSTD> + : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)), + (INSTD FPR64:$Rn, FPR64:$Rm)>; + +// Scalar Three Different + +class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS> + : NeonI_Scalar3Diff<u, size, opcode, + (outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm), + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + [], + NoItinerary>; + +multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> { + def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>; + def dss : NeonI_Scalar3Diff_size<u, 0b10, opcode, asmop, FPR64, FPR32>; +} + +multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> { + let Constraints = "$Src = $Rd" in { + def shh : NeonI_Scalar3Diff<u, 0b01, opcode, + (outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm), + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + [], + NoItinerary>; + def dss : NeonI_Scalar3Diff<u, 0b10, opcode, + (outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm), + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + [], + NoItinerary>; + } +} + +multiclass Neon_Scalar3Diff_HS_size_patterns<SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS> { + def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass Neon_Scalar3Diff_ml_HS_size_patterns<SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS> { + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR64:$Src, 
FPR32:$Rn, FPR32:$Rm)>; +} + +// Scalar Two Registers Miscellaneous + +class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS> + : NeonI_Scalar2SameMisc<u, size, opcode, + (outs FPRCD:$Rd), (ins FPRCS:$Rn), + !strconcat(asmop, "\t$Rd, $Rn"), + [], + NoItinerary>; + +multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode, + string asmop> { + def ss : NeonI_Scalar2SameMisc_size<u, {size_high, 0b0}, opcode, asmop, FPR32, + FPR32>; + def dd : NeonI_Scalar2SameMisc_size<u, {size_high, 0b1}, opcode, asmop, FPR64, + FPR64>; +} + +multiclass NeonI_Scalar2SameMisc_D_size<bit u, bits<5> opcode, string asmop> { + def dd : NeonI_Scalar2SameMisc_size<u, 0b11, opcode, asmop, FPR64, FPR64>; +} + +multiclass NeonI_Scalar2SameMisc_BHSD_size<bit u, bits<5> opcode, string asmop> + : NeonI_Scalar2SameMisc_D_size<u, opcode, asmop> { + def bb : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR8>; + def hh : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR16>; + def ss : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR32>; +} + +class NeonI_Scalar2SameMisc_fcvtxn_D_size<bit u, bits<5> opcode, string asmop> + : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR32, FPR64>; + +multiclass NeonI_Scalar2SameMisc_narrow_HSD_size<bit u, bits<5> opcode, + string asmop> { + def bh : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR16>; + def hs : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR32>; + def sd : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR64>; +} + +class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode, + string asmop, RegisterClass FPRC> + : NeonI_Scalar2SameMisc<u, size, opcode, + (outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn), + !strconcat(asmop, "\t$Rd, $Rn"), + [], + NoItinerary>; + +multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode, + string asmop> { + + let Constraints = "$Src = $Rd" in { + def bb : NeonI_Scalar2SameMisc_accum_size<u, 0b00, opcode, asmop, FPR8>; + def hh : NeonI_Scalar2SameMisc_accum_size<u, 0b01, opcode, asmop, FPR16>; + def ss : NeonI_Scalar2SameMisc_accum_size<u, 0b10, opcode, asmop, FPR32>; + def dd : NeonI_Scalar2SameMisc_accum_size<u, 0b11, opcode, asmop, FPR64>; + } +} + +class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> + : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; + +multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator Sopnode, + SDPatternOperator Dopnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop> + : NeonI_Scalar2SameMisc<u, 0b11, opcode, + (outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [], 
+ NoItinerary>; + +multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode, + string asmop> { + def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode, + (outs FPR32:$Rd), (ins FPR32:$Rn, fpz32:$FPImm), + !strconcat(asmop, "\t$Rd, $Rn, $FPImm"), + [], + NoItinerary>; + def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode, + (outs FPR64:$Rd), (ins FPR64:$Rn, fpz32:$FPImm), + !strconcat(asmop, "\t$Rd, $Rn, $FPImm"), + [], + NoItinerary>; +} + +class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), + (v1i64 (bitconvert (v8i8 Neon_AllZero))))), + (INSTD FPR64:$Rn, 0)>; + +class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC, + Instruction INSTD> + : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn), + (i32 neon_uimm0:$Imm), CC)), + (INSTD FPR64:$Rn, neon_uimm0:$Imm)>; + +multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), + (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))), + (INSTS FPR32:$Rn, fpz32:$FPImm)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), + (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))), + (INSTD FPR64:$Rn, fpz32:$FPImm)>; +} + +multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> { + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_BHSD_size_patterns<SDPatternOperator opnode, + Instruction INSTB, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> + : Neon_Scalar2SameMisc_D_size_patterns<opnode, INSTD> { + def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))), + (INSTB FPR8:$Rn)>; + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Rn)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Rn)>; + def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; + +} + +multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTB, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))), + (INSTB FPR8:$Src, FPR8:$Rn)>; + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Src, FPR16:$Rn)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Src, FPR32:$Rn)>; + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Src, FPR64:$Rn)>; +} + +// Scalar Shift By Immediate + +class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop, + RegisterClass FPRC, Operand ImmTy> + : NeonI_ScalarShiftImm<u, opcode, + (outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [], NoItinerary>; + +multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode, + string asmop> { + def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftRightImm_BHSD_size<bit u, bits<5> opcode, + string asmop> + : NeonI_ScalarShiftRightImm_D_size<u, opcode, asmop> { + def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, 
shr_imm8> { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shr_imm16> { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftLeftImm_D_size<bit u, bits<5> opcode, + string asmop> { + def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shl_imm64> { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftLeftImm_BHSD_size<bit u, bits<5> opcode, + string asmop> + : NeonI_ScalarShiftLeftImm_D_size<u, opcode, asmop> { + def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shl_imm8> { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shl_imm16> { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shl_imm32> { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} + +class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop> + : NeonI_ScalarShiftImm<u, opcode, + (outs FPR64:$Rd), + (ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [], NoItinerary> { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + let Constraints = "$Src = $Rd"; +} + +class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop> + : NeonI_ScalarShiftImm<u, opcode, + (outs FPR64:$Rd), + (ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [], NoItinerary> { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + let Constraints = "$Src = $Rd"; +} + +class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS, + Operand ImmTy> + : NeonI_ScalarShiftImm<u, opcode, + (outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [], NoItinerary>; + +multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode, + string asmop> { + def bhi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR8, FPR16, + shr_imm8> { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hsi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR16, FPR32, + shr_imm16> { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def sdi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR32, FPR64, + shr_imm32> { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftImm_cvt_SD_size<bit u, bits<5> opcode, string asmop> { + def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } + def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} + +multiclass Neon_ScalarShiftRImm_D_size_patterns<SDPatternOperator opnode, + Instruction 
INSTD> { + def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> { + def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +class Neon_ScalarShiftImm_arm_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), + (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))), + (INSTD FPR64:$Rn, imm:$Imm)>; + +multiclass Neon_ScalarShiftLImm_BHSD_size_patterns<SDPatternOperator opnode, + Instruction INSTB, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> + : Neon_ScalarShiftLImm_D_size_patterns<opnode, INSTD> { + def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))), + (INSTB FPR8:$Rn, imm:$Imm)>; + def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))), + (INSTH FPR16:$Rn, imm:$Imm)>; + def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; +} + +class Neon_ScalarShiftLImm_accum_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> + : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), + (i32 shl_imm64:$Imm))), + (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; + +class Neon_ScalarShiftRImm_accum_D_size_patterns<SDPatternOperator opnode, + Instruction INSTD> + : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), + (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; + +multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))), + (INSTH FPR16:$Rn, imm:$Imm)>; + def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator Sopnode, + SDPatternOperator Dopnode, + Instruction INSTS, + Instruction INSTD> { + def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator Sopnode, + SDPatternOperator Dopnode, + Instruction INSTS, + Instruction INSTD> { + def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +// Scalar Signed Shift Right (Immediate) +defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">; +defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>; +// Pattern to match llvm.arm.* intrinsic. +def : Neon_ScalarShiftImm_arm_D_size_patterns<sra, SSHRddi>; + +// Scalar Unsigned Shift Right (Immediate) +defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">; +defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>; +// Pattern to match llvm.arm.* intrinsic. 
+def : Neon_ScalarShiftImm_arm_D_size_patterns<srl, USHRddi>; + +// Scalar Signed Rounding Shift Right (Immediate) +defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">; +defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vsrshr, SRSHRddi>; + +// Scalar Unsigned Rounding Shift Right (Immediate) +defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">; +defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vurshr, URSHRddi>; + +// Scalar Signed Shift Right and Accumulate (Immediate) +def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + <int_aarch64_neon_vsrads_n, SSRA>; + +// Scalar Unsigned Shift Right and Accumulate (Immediate) +def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + <int_aarch64_neon_vsradu_n, USRA>; + +// Scalar Signed Rounding Shift Right and Accumulate (Immediate) +def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + <int_aarch64_neon_vrsrads_n, SRSRA>; + +// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate) +def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + <int_aarch64_neon_vrsradu_n, URSRA>; + +// Scalar Shift Left (Immediate) +defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">; +defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>; +// Pattern to match llvm.arm.* intrinsic. +def : Neon_ScalarShiftImm_arm_D_size_patterns<shl, SHLddi>; + +// Signed Saturating Shift Left (Immediate) +defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshls_n, + SQSHLbbi, SQSHLhhi, + SQSHLssi, SQSHLddi>; +// Pattern to match llvm.arm.* intrinsic. +defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_sqrshlImm, SQSHLddi>; + +// Unsigned Saturating Shift Left (Immediate) +defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshlu_n, + UQSHLbbi, UQSHLhhi, + UQSHLssi, UQSHLddi>; +// Pattern to match llvm.arm.* intrinsic. 
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_uqrshlImm, UQSHLddi>; + +// Signed Saturating Shift Left Unsigned (Immediate) +defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vsqshlu, + SQSHLUbbi, SQSHLUhhi, + SQSHLUssi, SQSHLUddi>; + +// Shift Right And Insert (Immediate) +def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + <int_aarch64_neon_vsri, SRI>; + +// Shift Left And Insert (Immediate) +def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">; +def : Neon_ScalarShiftLImm_accum_D_size_patterns + <int_aarch64_neon_vsli, SLI>; + +// Signed Saturating Shift Right Narrow (Immediate) +defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrn, + SQSHRNbhi, SQSHRNhsi, + SQSHRNsdi>; + +// Unsigned Saturating Shift Right Narrow (Immediate) +defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqshrn, + UQSHRNbhi, UQSHRNhsi, + UQSHRNsdi>; + +// Signed Saturating Rounded Shift Right Narrow (Immediate) +defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrn, + SQRSHRNbhi, SQRSHRNhsi, + SQRSHRNsdi>; + +// Unsigned Saturating Rounded Shift Right Narrow (Immediate) +defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqrshrn, + UQRSHRNbhi, UQRSHRNhsi, + UQRSHRNsdi>; + +// Signed Saturating Shift Right Unsigned Narrow (Immediate) +defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrun, + SQSHRUNbhi, SQSHRUNhsi, + SQSHRUNsdi>; + +// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) +defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun, + SQRSHRUNbhi, SQRSHRUNhsi, + SQRSHRUNsdi>; + +// Scalar Signed Fixed-point Convert To Floating-Point (Immediate) +defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">; +defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_s32, + int_aarch64_neon_vcvtf64_n_s64, + SCVTF_Nssi, SCVTF_Nddi>; + +// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) +defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">; +defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_u32, + int_aarch64_neon_vcvtf64_n_u64, + UCVTF_Nssi, UCVTF_Nddi>; + +// Scalar Floating-point Convert To Signed Fixed-point (Immediate) +defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">; +defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_s32_f32, + int_aarch64_neon_vcvtd_n_s64_f64, + FCVTZS_Nssi, FCVTZS_Nddi>; + +// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) +defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">; +defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_u32_f32, + int_aarch64_neon_vcvtd_n_u64_f64, + FCVTZU_Nssi, FCVTZU_Nddi>; + +// Patterns For Convert Instructions Between v1f64 and v1i64 +class 
Neon_ScalarShiftImm_cvtf_v1f64_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INST FPR64:$Rn, imm:$Imm)>; + +class Neon_ScalarShiftImm_fcvt_v1f64_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INST FPR64:$Rn, imm:$Imm)>; + +def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxs2fp, + SCVTF_Nddi>; + +def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxu2fp, + UCVTF_Nddi>; + +def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxs, + FCVTZS_Nddi>; + +def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxu, + FCVTZU_Nddi>; + +// Scalar Integer Add +let isCommutable = 1 in { +def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; +} + +// Scalar Integer Sub +def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; + +// Pattern for Scalar Integer Add and Sub with D register only +defm : Neon_Scalar3Same_D_size_patterns<add, ADDddd>; +defm : Neon_Scalar3Same_D_size_patterns<sub, SUBddd>; + +// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vaddds, ADDddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vadddu, ADDddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubds, SUBddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubdu, SUBddd>; + +// Scalar Integer Saturating Add (Signed, Unsigned) +defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; +defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; + +// Scalar Integer Saturating Sub (Signed, Unsigned) +defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; +defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; + + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Add, Sub (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqadds, SQADDbbb, + SQADDhhh, SQADDsss, SQADDddd>; +defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqaddu, UQADDbbb, + UQADDhhh, UQADDsss, UQADDddd>; +defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubs, SQSUBbbb, + SQSUBhhh, SQSUBsss, SQSUBddd>; +defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb, + UQSUBhhh, UQSUBsss, UQSUBddd>; + +// Scalar Integer Saturating Doubling Multiply Half High +defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>; + +// Scalar Integer Saturating Rounding Doubling Multiply Half High +defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Doubling Multiply Half High and +// Scalar Integer Saturating Rounding Doubling Multiply Half High +defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh, + SQDMULHsss>; +defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh, + SQRDMULHsss>; + +// Scalar Floating-point Multiply Extended +defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>; + +// Scalar Floating-point Reciprocal Step +defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>; + +// Scalar Floating-point Reciprocal Square Root Step +defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Floating-point Reciprocal Step and +// Scalar Floating-point Reciprocal 
Square Root Step +defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrecps, FRECPSsss, + FRECPSddd>; +defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrsqrts, FRSQRTSsss, + FRSQRTSddd>; + +def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Floating-point Multiply Extended, +multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx, + FMULXsss,FMULXddd>; + +// Scalar Integer Shift Left (Signed, Unsigned) +def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">; +def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshifts, SSHLddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshiftu, USHLddd>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshlds, SSHLddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshldu, USHLddd>; + +// Scalar Integer Saturating Shift Left (Signed, Unsigned) +defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>; +defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Shift Letf (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshls, SQSHLbbb, + SQSHLhhh, SQSHLsss, SQSHLddd>; +defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshlu, UQSHLbbb, + UQSHLhhh, UQSHLsss, UQSHLddd>; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Shift Letf (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>; + +// Scalar Integer Rounding Shift Left (Signed, Unsigned) +def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">; +def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshlds, SRSHLddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshldu, URSHLddd>; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>; + +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>; +defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshls, SQRSHLbbb, + SQRSHLhhh, SQRSHLsss, SQRSHLddd>; +defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb, + UQRSHLhhh, UQRSHLsss, UQRSHLddd>; + +// Patterns to match 
llvm.arm.* intrinsic for +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>; +defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>; + +// Signed Saturating Doubling Multiply-Add Long +defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">; +defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal, + SQDMLALshh, SQDMLALdss>; + +// Signed Saturating Doubling Multiply-Subtract Long +defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">; +defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl, + SQDMLSLshh, SQDMLSLdss>; + +// Signed Saturating Doubling Multiply Long +defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">; +defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull, + SQDMULLshh, SQDMULLdss>; + +// Scalar Signed Integer Convert To Floating-point +defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">; +defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_s32, + int_aarch64_neon_vcvtf64_s64, + SCVTFss, SCVTFdd>; + +// Scalar Unsigned Integer Convert To Floating-point +defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">; +defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_u32, + int_aarch64_neon_vcvtf64_u64, + UCVTFss, UCVTFdd>; + +// Scalar Floating-point Converts +def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">; +def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn, + FCVTXN>; + +defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns, + FCVTNSss, FCVTNSdd>; + +defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu, + FCVTNUss, FCVTNUdd>; + +defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms, + FCVTMSss, FCVTMSdd>; + +defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu, + FCVTMUss, FCVTMUdd>; + +defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas, + FCVTASss, FCVTASdd>; + +defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau, + FCVTAUss, FCVTAUdd>; + +defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps, + FCVTPSss, FCVTPSdd>; + +defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu, + FCVTPUss, FCVTPUdd>; + +defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs, + FCVTZSss, FCVTZSdd>; + +defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu, + FCVTZUss, FCVTZUdd>; + +// Patterns For Convert Instructions Between v1f64 and v1i64 +class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1f64 
(opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<sint_to_fp, SCVTFdd>; +def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<uint_to_fp, UCVTFdd>; + +def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_sint, FCVTZSdd>; +def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>; + +// Scalar Floating-point Reciprocal Estimate +defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">; +defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrecpe, + FRECPEss, FRECPEdd>; + +// Scalar Floating-point Reciprocal Exponent +defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">; +defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx, + FRECPXss, FRECPXdd>; + +// Scalar Floating-point Reciprocal Square Root Estimate +defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">; +defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrsqrte, + FRSQRTEss, FRSQRTEdd>; + +// Scalar Floating-point Round +class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST> + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +def : Neon_ScalarFloatRound_pattern<fceil, FRINTPdd>; +def : Neon_ScalarFloatRound_pattern<ffloor, FRINTMdd>; +def : Neon_ScalarFloatRound_pattern<ftrunc, FRINTZdd>; +def : Neon_ScalarFloatRound_pattern<frint, FRINTXdd>; +def : Neon_ScalarFloatRound_pattern<fnearbyint, FRINTIdd>; +def : Neon_ScalarFloatRound_pattern<frnd, FRINTAdd>; +def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>; + +// Scalar Integer Compare + +// Scalar Compare Bitwise Equal +def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">; +def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>; + +class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode, + Instruction INSTD, + CondCode CC> + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)), + (INSTD FPR64:$Rn, FPR64:$Rm)>; + +def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>; + +// Scalar Compare Signed Greather Than Or Equal +def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">; +def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>; + +// Scalar Compare Unsigned Higher Or Same +def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">; +def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>; + +// Scalar Compare Unsigned Higher +def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">; +def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>; + +// Scalar Compare Signed Greater Than +def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">; +def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>; + +// Scalar Compare Bitwise Test Bits +def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">; +def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>; +def : Neon_Scalar3Same_cmp_D_size_patterns<Neon_tst, CMTSTddd>; + +// Scalar Compare Bitwise Equal To Zero +def 
CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vceq,
+                                                CMEQddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETEQ, CMEQddi>;
+
+// Scalar Compare Signed Greater Than Or Equal To Zero
+def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcge,
+                                                CMGEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGE, CMGEddi>;
+
+// Scalar Compare Signed Greater Than Zero
+def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcgt,
+                                                CMGTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGT, CMGTddi>;
+
+// Scalar Compare Signed Less Than Or Equal To Zero
+def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vclez,
+                                                CMLEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLE, CMLEddi>;
+
+// Scalar Compare Less Than Zero
+def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcltz,
+                                                CMLTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
+
+// Scalar Floating-point Compare
+
+// Scalar Floating-point Compare Mask Equal
+defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vceq,
+                                             FCMEQsss, FCMEQddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
+
+// Scalar Floating-point Compare Mask Equal To Zero
+defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vceq,
+                                                  FCMEQZssi, FCMEQZddi>;
+def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpz32:$FPImm), SETEQ)),
+          (FCMEQZddi FPR64:$Rn, fpz32:$FPImm)>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcge,
+                                             FCMGEsss, FCMGEddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcge,
+                                                  FCMGEZssi, FCMGEZddi>;
+
+// Scalar Floating-point Compare Mask Greater Than
+defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcgt,
+                                             FCMGTsss, FCMGTddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
+
+// Scalar Floating-point Compare Mask Greater Than Zero
+defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcgt,
+                                                  FCMGTZssi, FCMGTZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vclez,
+                                                  FCMLEZssi, FCMLEZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Zero
+defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcltz,
+                                                  FCMLTZssi, FCMLTZddi>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
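The definitions above give the scalar integer compares (CMEQ/CMGE/CMGT/CMLE/CMLT against a register or zero) and the floating-point compare masks (FCMEQ/FCMGE/FCMGT and their zero forms) both intrinsic-based and generic Neon_cmp/Neon_cmpz selection patterns; the absolute-compare forms announced by the header just above (FACGE/FACGT) follow right after this sketch. A minimal C-level illustration, assuming the conventional ACLE scalar compare intrinsics from <arm_neon.h> (not part of this patch); each result is an all-ones or all-zeros mask:

  #include <arm_neon.h>
  #include <stdint.h>

  uint64_t eq_d(int64_t a, int64_t b)   { return vceqd_s64(a, b);  }  /* cmeq  d,d,d  -> CMEQddd  */
  uint64_t eqz_d(int64_t a)             { return vceqzd_s64(a);    }  /* cmeq  d,d,#0 -> CMEQddi  */
  uint32_t ge_s(float a, float b)       { return vcges_f32(a, b);  }  /* fcmge s,s,s  -> FCMGEsss */
  uint64_t abs_ge_d(double a, double b) { return vcaged_f64(a, b); }  /* facge d,d,d  -> FACGEddd */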
+defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcage, + FACGEsss, FACGEddd>; + +// Scalar Floating-point Absolute Compare Mask Greater Than +defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcagt, + FACGTsss, FACGTddd>; + +// Scakar Floating-point Absolute Difference +defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">; +defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd, + FABDsss, FABDddd>; + +// Scalar Absolute Value +defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">; +defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vabs, ABSdd>; + +// Scalar Signed Saturating Absolute Value +defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">; +defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqabs, + SQABSbb, SQABShh, SQABSss, SQABSdd>; + +// Scalar Negate +defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">; +defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vneg, NEGdd>; + +// Scalar Signed Saturating Negate +defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">; +defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqneg, + SQNEGbb, SQNEGhh, SQNEGss, SQNEGdd>; + +// Scalar Signed Saturating Accumulated of Unsigned Value +defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">; +defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vuqadd, + SUQADDbb, SUQADDhh, + SUQADDss, SUQADDdd>; + +// Scalar Unsigned Saturating Accumulated of Signed Value +defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">; +defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vsqadd, + USQADDbb, USQADDhh, + USQADDss, USQADDdd>; + +def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src), + (v1i64 FPR64:$Rn))), + (SUQADDdd FPR64:$Src, FPR64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src), + (v1i64 FPR64:$Rn))), + (USQADDdd FPR64:$Src, FPR64:$Rn)>; + +def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))), + (ABSdd FPR64:$Rn)>; + +def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))), + (SQABSdd FPR64:$Rn)>; + +def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))), + (SQNEGdd FPR64:$Rn)>; + +def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))), + (v1i64 FPR64:$Rn))), + (NEGdd FPR64:$Rn)>; + +// Scalar Signed Saturating Extract Unsigned Narrow +defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">; +defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnsu, + SQXTUNbh, SQXTUNhs, + SQXTUNsd>; + +// Scalar Signed Saturating Extract Narrow +defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">; +defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovns, + SQXTNbh, SQXTNhs, + SQXTNsd>; + +// Scalar Unsigned Saturating Extract Narrow +defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">; +defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnu, + UQXTNbh, UQXTNhs, + UQXTNsd>; + +// Scalar Reduce Pairwise + +multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def _D_2D : NeonI_ScalarPair<u, {size, 0b1}, opcode, + (outs FPR64:$Rd), (ins VPR128:$Rn), + !strconcat(asmop, "\t$Rd, $Rn.2d"), + [], + NoItinerary>; + } +} + +multiclass 
NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode, + string asmop, bit Commutable = 0> + : NeonI_ScalarPair_D_sizes<u, size, opcode, asmop, Commutable> { + let isCommutable = Commutable in { + def _S_2S : NeonI_ScalarPair<u, {size, 0b0}, opcode, + (outs FPR32:$Rd), (ins VPR64:$Rn), + !strconcat(asmop, "\t$Rd, $Rn.2s"), + [], + NoItinerary>; + } +} + +// Scalar Reduce Addition Pairwise (Integer) with +// Pattern to match llvm.arm.* intrinsic +defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>; + +// Pattern to match llvm.aarch64.* intrinsic for +// Scalar Reduce Addition Pairwise (Integer) +def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))), + (ADDPvv_D_2D VPR128:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))), + (ADDPvv_D_2D VPR128:$Rn)>; + +// Scalar Reduce Addition Pairwise (Floating Point) +defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>; + +// Scalar Reduce Maximum Pairwise (Floating Point) +defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>; + +// Scalar Reduce Minimum Pairwise (Floating Point) +defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>; + +// Scalar Reduce maxNum Pairwise (Floating Point) +defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>; + +// Scalar Reduce minNum Pairwise (Floating Point) +defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>; + +multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnodeS, + SDPatternOperator opnodeD, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))), + (INSTS VPR64:$Rn)>; + def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))), + (INSTD VPR128:$Rn)>; +} + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point) +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd, + int_aarch64_neon_vpfaddq, FADDPvv_S_2S, FADDPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax, + int_aarch64_neon_vpmaxq, FMAXPvv_S_2S, FMAXPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin, + int_aarch64_neon_vpminq, FMINPvv_S_2S, FMINPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm, + int_aarch64_neon_vpfmaxnmq, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm, + int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vaddv, + int_aarch64_neon_vaddv, FADDPvv_S_2S, FADDPvv_D_2D>; + +def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))), + (FADDPvv_S_2S (v2f32 + (EXTRACT_SUBREG + (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))), + sub_64)))>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxv, + int_aarch64_neon_vmaxv, FMAXPvv_S_2S, FMAXPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminv, + int_aarch64_neon_vminv, FMINPvv_S_2S, FMINPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxnmv, + int_aarch64_neon_vmaxnmv, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>; + +defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminnmv, + int_aarch64_neon_vminnmv, FMINNMPvv_S_2S, FMINNMPvv_D_2D>; + +// Scalar by element Arithmetic + +class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode, + string rmlane, bit u, bit szhi, bit szlo, + RegisterClass ResFPR, RegisterClass OpFPR, + RegisterOperand OpVPR, Operand OpImm> + 
: NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode, + (outs ResFPR:$Rd), + (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm), + asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]", + [], + NoItinerary> { + bits<3> Imm; + bits<5> MRm; +} + +class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode, + string rmlane, + bit u, bit szhi, bit szlo, + RegisterClass ResFPR, + RegisterClass OpFPR, + RegisterOperand OpVPR, + Operand OpImm> + : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode, + (outs ResFPR:$Rd), + (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm), + asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]", + [], + NoItinerary> { + let Constraints = "$src = $Rd"; + bits<3> Imm; + bits<5> MRm; +} + +// Scalar Floating Point multiply (scalar, by element) +def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul", + 0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul", + 0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +// Scalar Floating Point multiply extended (scalar, by element) +def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx", + 0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx", + 0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns< + SDPatternOperator opnode, + Instruction INST, + ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + + def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))), + (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))), + (ResTy (INST (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped operands + def : Pat<(ResTy (opnode + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Rn))), + (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (opnode + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Rn))), + (ResTy (INST (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for Scalar Floating Point multiply (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULssv_4S, + f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULddv_2D, + f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>; + +// Patterns for Scalar Floating Point multiply extended (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx, + FMULXssv_4S, f32, FPR32, v4f32, neon_uimm2_bare, + v2f32, v4f32, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx, + FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare, + v1f64, v2f64, neon_uimm0_bare>; + + +// Scalar Floating Point fused multiply-add (scalar, by element) +def 
FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", + 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", + 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +// Scalar Floating Point fused multiply-subtract (scalar, by element) +def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", + 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", + 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} +// We are allowed to match the fma instruction regardless of compile options. +multiclass Neon_ScalarXIndexedElem_FMA_Patterns< + Instruction FMLAI, Instruction FMLSI, + ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + // fmla + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped fmla operands + def : Pat<(ResTy (fma + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // fmls + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped fmls operands + def : Pat<(ResTy (fma + (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma + (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; +} + +// Scalar Floating Point fused multiply-add and +// multiply-subtract (scalar, by element) +defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAssv_4S, FMLSssv_4S, + f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, 
FMLSddv_2D, + f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>; +defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D, + f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>; + +// Scalar Signed saturating doubling multiply long (scalar, by element) +def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MUL_Patterns< + SDPatternOperator opnode, + Instruction INST, + ValueType ResTy, RegisterClass FPRC, + ValueType OpVTy, ValueType OpTy, + ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { + + def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), + (OpVTy (scalar_to_vector + (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))), + (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; + + //swapped operands + def : Pat<(ResTy (opnode + (OpVTy (scalar_to_vector + (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))), + (OpVTy FPRC:$Rn))), + (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; +} + + +// Patterns for Scalar Signed saturating doubling +// multiply long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull, + SQDMULLshv_4H, v1i32, FPR16, v1i16, i16, v4i16, + i32, VPR64Lo, neon_uimm2_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull, + SQDMULLshv_8H, v1i32, FPR16, v1i16, i16, v8i16, + i32, VPR128Lo, neon_uimm3_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull, + SQDMULLdsv_2S, v1i64, FPR32, v1i32, i32, v2i32, + i32, VPR64Lo, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull, + SQDMULLdsv_4S, v1i64, FPR32, v1i32, i32, v4i32, + i32, VPR128Lo, neon_uimm2_bare>; + +// Scalar Signed saturating doubling multiply-add long (scalar, by element) +def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMLALdsv_4S : 
NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +// Scalar Signed saturating doubling +// multiply-subtract long (scalar, by element) +def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MLAL_Patterns< + SDPatternOperator opnode, + SDPatternOperator coreopnode, + Instruction INST, + ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC, + ValueType OpTy, + ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { + + def : Pat<(ResTy (opnode + (ResTy ResFPRC:$Ra), + (ResTy (coreopnode (OpTy FPRC:$Rn), + (OpTy (scalar_to_vector + (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))), + (ResTy (INST (ResTy ResFPRC:$Ra), + (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; + + // swapped operands + def : Pat<(ResTy (opnode + (ResTy ResFPRC:$Ra), + (ResTy (coreopnode + (OpTy (scalar_to_vector + (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))), + (OpTy FPRC:$Rn))))), + (ResTy (INST (ResTy ResFPRC:$Ra), + (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; +} + +// Patterns for Scalar Signed saturating +// doubling multiply-add long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds, + int_arm_neon_vqdmull, SQDMLALshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16, + i32, VPR64Lo, neon_uimm2_bare>; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds, + int_arm_neon_vqdmull, SQDMLALshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16, + i32, VPR128Lo, neon_uimm3_bare>; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds, + int_arm_neon_vqdmull, SQDMLALdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32, + i32, VPR64Lo, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds, + int_arm_neon_vqdmull, SQDMLALdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32, + i32, VPR128Lo, neon_uimm2_bare>; + +// Patterns for Scalar Signed saturating +// doubling multiply-sub long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs, + int_arm_neon_vqdmull, SQDMLSLshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16, + i32, VPR64Lo, neon_uimm2_bare>; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs, + int_arm_neon_vqdmull, SQDMLSLshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16, + i32, VPR128Lo, neon_uimm3_bare>; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs, + int_arm_neon_vqdmull, SQDMLSLdsv_2S, v1i64, FPR64, FPR32, v1i32, 
v2i32, + i32, VPR64Lo, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs, + int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32, + i32, VPR128Lo, neon_uimm2_bare>; + +// Scalar general arithmetic operation +class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INST FPR64:$Rn, FPR64:$Rm)>; + +class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode, + Instruction INST> + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), + (v1f64 FPR64:$Ra))), + (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>; +def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>; +def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>; +def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>; +def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>; +def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>; +def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>; +def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>; +def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>; + +def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>; +def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>; + +def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>; +def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>; + +// Scalar Signed saturating doubling multiply returning +// high half (scalar, by element) +def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +// Patterns for Scalar Signed saturating doubling multiply returning +// high half (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh, + SQDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, + i32, VPR64Lo, neon_uimm2_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh, + SQDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, + i32, VPR128Lo, neon_uimm3_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh, + SQDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, + i32, VPR64Lo, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh, + SQDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, + i32, VPR128Lo, neon_uimm2_bare>; + +// Scalar Signed saturating rounding doubling multiply +// returning high half (scalar, by element) +def SQRDMULHhhv_4H : 
NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh, + SQRDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, i32, + VPR64Lo, neon_uimm2_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh, + SQRDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, i32, + VPR128Lo, neon_uimm3_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh, + SQRDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, i32, + VPR64Lo, neon_uimm1_bare>; +defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh, + SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32, + VPR128Lo, neon_uimm2_bare>; + +// Scalar Copy - DUP element to scalar +class NeonI_Scalar_DUP<string asmop, string asmlane, + RegisterClass ResRC, RegisterOperand VPRC, + Operand OpImm> + : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." 
# asmlane # "[$Imm]", + [], + NoItinerary> { + bits<4> Imm; +} + +def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +multiclass NeonI_Scalar_DUP_Elt_pattern<Instruction DUPI, ValueType ResTy, + ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for vector extract of FP data using scalar DUP instructions +defm : NeonI_Scalar_DUP_Elt_pattern<DUPsv_S, f32, + v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>; +defm : NeonI_Scalar_DUP_Elt_pattern<DUPdv_D, f64, + v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>; + +multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI, + ValueType ResTy, ValueType OpTy,Operand OpLImm, + ValueType NOpTy, ValueType ExTy, Operand OpNImm> { + + def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>; + + def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for extract subvectors of v1ix data using scalar DUP instructions. +defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPbv_B, v1i8, v16i8, neon_uimm4_bare, + v8i8, v16i8, neon_uimm3_bare>; +defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPhv_H, v1i16, v8i16, neon_uimm3_bare, + v4i16, v8i16, neon_uimm2_bare>; +defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPsv_S, v1i32, v4i32, neon_uimm2_bare, + v2i32, v4i32, neon_uimm1_bare>; + +multiclass NeonI_Scalar_DUP_Copy_pattern1<Instruction DUPI, ValueType ResTy, + ValueType OpTy, ValueType ElemTy, + Operand OpImm, ValueType OpNTy, + ValueType ExTy, Operand OpNImm> { + + def : Pat<(ResTy (vector_insert (ResTy undef), + (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), + (neon_uimm0_bare:$Imm))), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (vector_insert (ResTy undef), + (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), + (OpNImm:$Imm))), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +multiclass NeonI_Scalar_DUP_Copy_pattern2<Instruction DUPI, ValueType ResTy, + ValueType OpTy, ValueType ElemTy, + Operand OpImm, ValueType OpNTy, + ValueType ExTy, Operand OpNImm> { + + def : Pat<(ResTy (scalar_to_vector + (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (scalar_to_vector + (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP +// instructions. 
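The scalar DUP definitions above, together with the NeonI_Scalar_DUP_*_pattern multiclasses, fold vector_extract, extract_subvector and single-element copies into one "dup Vd, Vn.T[Imm]"; the v1ix/v1fx copy patterns announced just above continue right after this sketch. A hedged C-level illustration, assuming the usual ACLE lane intrinsics (not shown in this patch):

  #include <arm_neon.h>

  float lane2(float32x4_t v) {
    return vgetq_lane_f32(v, 2);   /* expected to use dup s, v.s[2] (DUPsv_S) or an equivalent move */
  }

  float64_t lane1(float64x2_t v) {
    return vdupd_laneq_f64(v, 1);  /* dup d, v.d[1] -> DUPdv_D */
  }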
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D, + v1i64, v2i64, i64, neon_uimm1_bare, + v1i64, v2i64, neon_uimm0_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S, + v1i32, v4i32, i32, neon_uimm2_bare, + v2i32, v4i32, neon_uimm1_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H, + v1i16, v8i16, i32, neon_uimm3_bare, + v4i16, v8i16, neon_uimm2_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B, + v1i8, v16i8, i32, neon_uimm4_bare, + v8i8, v16i8, neon_uimm3_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D, + v1f64, v2f64, f64, neon_uimm1_bare, + v1f64, v2f64, neon_uimm0_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S, + v1f32, v4f32, f32, neon_uimm2_bare, + v2f32, v4f32, neon_uimm1_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D, + v1i64, v2i64, i64, neon_uimm1_bare, + v1i64, v2i64, neon_uimm0_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S, + v1i32, v4i32, i32, neon_uimm2_bare, + v2i32, v4i32, neon_uimm1_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H, + v1i16, v8i16, i32, neon_uimm3_bare, + v4i16, v8i16, neon_uimm2_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B, + v1i8, v16i8, i32, neon_uimm4_bare, + v8i8, v16i8, neon_uimm3_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D, + v1f64, v2f64, f64, neon_uimm1_bare, + v1f64, v2f64, neon_uimm0_bare>; +defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S, + v1f32, v4f32, f32, neon_uimm2_bare, + v2f32, v4f32, neon_uimm1_bare>; + +multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane, + Instruction DUPI, Operand OpImm, + RegisterClass ResRC> { + def : NeonInstAlias<!strconcat(asmop, "$Rd, $Rn" # asmlane # "[$Imm]"), + (DUPI ResRC:$Rd, VPR128:$Rn, OpImm:$Imm), 0b0>; +} + +// Aliases for Scalar copy - DUP element (scalar) +// FIXME: This is actually the preferred syntax but TableGen can't deal with +// custom printing of aliases. +defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>; +defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>; +defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>; +defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>; + +multiclass NeonI_SDUP<PatFrag GetLow, PatFrag GetHigh, ValueType ResTy, + ValueType OpTy> { + def : Pat<(ResTy (GetLow VPR128:$Rn)), + (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>; + def : Pat<(ResTy (GetHigh VPR128:$Rn)), + (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>; +} + +defm : NeonI_SDUP<Neon_Low16B, Neon_High16B, v8i8, v16i8>; +defm : NeonI_SDUP<Neon_Low8H, Neon_High8H, v4i16, v8i16>; +defm : NeonI_SDUP<Neon_Low4S, Neon_High4S, v2i32, v4i32>; +defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>; +defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>; +defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// 64-bit vector bitcasts... 
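The bitcast patterns announced just above (the 64-bit forms first, then the 128-bit and scalar cases) all rewrite bitconvert to the unchanged source register: reinterpreting a value as another type of the same width costs no instruction, and only a move between the general-purpose and FP/SIMD register files needs an FMOV, as the FMOVxd/FMOVdx patterns further down show. In C terms this is what the vreinterpret casts compile to (a sketch, assuming the standard <arm_neon.h> names):

  #include <arm_neon.h>

  int32x2_t as_s32(float32x2_t v) { return vreinterpret_s32_f32(v);  }  /* no instruction emitted */
  int16x8_t as_s16(float64x2_t v) { return vreinterpretq_s16_f64(v); }  /* no instruction emitted */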
+ +def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; + +// ..and 128-bit vector bitcasts... + +def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 
(bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; + +// ...and scalar bitcasts... +def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>; +def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>; +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f32 (bitconvert (v1f32 FPR32:$src))), (f32 FPR32:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>; + +def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>; + +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; + +def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>; + +def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>; +def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f32 (bitconvert (f32 FPR32:$src))), (v1f32 FPR32:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; + +def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>; + +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 
(bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; + +// Scalar Three Same + +def neon_uimm3 : Operand<i64>, + ImmLeaf<i64, [{return Imm < 8;}]> { + let ParserMatchClass = uimm3_asmoperand; + let PrintMethod = "printUImmHexOperand"; +} + +def neon_uimm4 : Operand<i64>, + ImmLeaf<i64, [{return Imm < 16;}]> { + let ParserMatchClass = uimm4_asmoperand; + let PrintMethod = "printUImmHexOperand"; +} + +// Bitwise Extract +class NeonI_Extract<bit q, bits<2> op2, string asmop, + string OpS, RegisterOperand OpVPR, Operand OpImm> + : NeonI_BitExtract<q, op2, (outs OpVPR:$Rd), + (ins OpVPR:$Rn, OpVPR:$Rm, OpImm:$Index), + asmop # "\t$Rd." # OpS # ", $Rn." # OpS # + ", $Rm." # OpS # ", $Index", + [], + NoItinerary>{ + bits<4> Index; +} + +def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b", + VPR64, neon_uimm3> { + let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}}; +} + +def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b", + VPR128, neon_uimm4> { + let Inst{14-11} = Index; +} + +class NI_Extract<ValueType OpTy, RegisterOperand OpVPR, Instruction INST, + Operand OpImm> + : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm), + (i64 OpImm:$Imm))), + (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>; + +def : NI_Extract<v8i8, VPR64, EXTvvvi_8b, neon_uimm3>; +def : NI_Extract<v4i16, VPR64, EXTvvvi_8b, neon_uimm3>; +def : NI_Extract<v2i32, VPR64, EXTvvvi_8b, neon_uimm3>; +def : NI_Extract<v1i64, VPR64, EXTvvvi_8b, neon_uimm3>; +def : NI_Extract<v2f32, VPR64, EXTvvvi_8b, neon_uimm3>; +def : NI_Extract<v1f64, VPR64, EXTvvvi_8b, neon_uimm3>; +def : NI_Extract<v16i8, VPR128, EXTvvvi_16b, neon_uimm4>; +def : NI_Extract<v8i16, VPR128, EXTvvvi_16b, neon_uimm4>; +def : NI_Extract<v4i32, VPR128, EXTvvvi_16b, neon_uimm4>; +def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>; +def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>; +def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>; + +// Table lookup +class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op, + string asmop, string OpS, RegisterOperand OpVPR, + RegisterOperand VecList> + : NeonI_TBL<q, op2, len, op, + (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS, + [], + NoItinerary>; + +// The vectors in look up table are always 16b +multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> { + def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64, + !cast<RegisterOperand>(List # "16B_operand")>; + + def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128, + !cast<RegisterOperand>(List # "16B_operand")>; +} + +defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">; +defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">; +defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">; +defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">; + +// Table lookup extention +class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op, + string asmop, string OpS, RegisterOperand OpVPR, + RegisterOperand VecList> + : NeonI_TBL<q, op2, len, op, + (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm), + asmop # "\t$Rd." # OpS # ", $Rn, $Rm." 
# OpS, + [], + NoItinerary> { + let Constraints = "$src = $Rd"; +} + +// The vectors in look up table are always 16b +multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> { + def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64, + !cast<RegisterOperand>(List # "16B_operand")>; + + def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128, + !cast<RegisterOperand>(List # "16B_operand")>; +} + +defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">; +defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">; +defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">; +defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">; + +class NeonI_INS_main<string asmop, string Res, ValueType ResTy, + RegisterClass OpGPR, ValueType OpTy, Operand OpImm> + : NeonI_copy<0b1, 0b0, 0b0011, + (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), + asmop # "\t$Rd." # Res # "[$Imm], $Rn", + [(set (ResTy VPR128:$Rd), + (ResTy (vector_insert + (ResTy VPR128:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))))], + NoItinerary> { + bits<4> Imm; + let Constraints = "$src = $Rd"; +} + +//Insert element (vector, from main) +def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", + (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", + (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", + (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", + (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy, + RegisterClass OpGPR, ValueType OpTy, + Operand OpImm, Instruction INS> + : Pat<(ResTy (vector_insert + (ResTy VPR64:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))), + (ResTy (EXTRACT_SUBREG + (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + OpGPR:$Rn, OpImm:$Imm)), sub_64))>; + +def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32, + neon_uimm3_bare, INSbw>; +def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32, + neon_uimm2_bare, INShw>; +def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32, + neon_uimm1_bare, INSsw>; +def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64, + neon_uimm0_bare, INSdx>; + +class NeonI_INS_element<string asmop, string Res, Operand ResImm> + : NeonI_insert<0b1, 0b1, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, + ResImm:$Immd, ResImm:$Immn), + asmop # "\t$Rd." # Res # "[$Immd], $Rn." 
# Res # "[$Immn]", + [], + NoItinerary> { + let Constraints = "$src = $Rd"; + bits<4> Immd; + bits<4> Immn; +} + +//Insert element (vector, from element) +def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { + let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; + let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; +} +def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { + let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; + let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; + // bit 11 is unspecified, but should be set to zero. +} +def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { + let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; + let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; + // bits 11-12 are unspecified, but should be set to zero. +} +def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { + let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; + let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; + // bits 11-13 are unspecified, but should be set to zero. +} + +def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", + (INSELb VPR128:$Rd, VPR128:$Rn, + neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", + (INSELh VPR128:$Rd, VPR128:$Rn, + neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", + (INSELs VPR128:$Rd, VPR128:$Rn, + neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", + (INSELd VPR128:$Rd, VPR128:$Rn, + neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; + +multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy, + ValueType MidTy, Operand StImm, Operand NaImm, + Instruction INS> { +def : Pat<(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), + StImm:$Immd, StImm:$Immn)>; + +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + StImm:$Immd, NaImm:$Immn)>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy VPR128:$Rn), + NaImm:$Immd, StImm:$Immn)), + sub_64))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + NaImm:$Immd, NaImm:$Immn)), + sub_64))>; +} + +defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare, + neon_uimm1_bare, INSELs>; +defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare, + neon_uimm0_bare, INSELd>; +defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare, + neon_uimm3_bare, INSELb>; +defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare, + neon_uimm2_bare, INSELh>; +defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare, + neon_uimm1_bare, INSELs>; +defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare, + neon_uimm0_bare, INSELd>; + +multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy, + ValueType MidTy, + RegisterClass OpFPR, Operand 
ResImm, + SubRegIndex SubIndex, Instruction INS> { +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), + ResImm:$Imm, + (i64 0))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), + ResImm:$Imm, + (i64 0))), + sub_64))>; +} + +defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare, + sub_32, INSELs>; +defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare, + sub_64, INSELd>; + +class NeonI_SMOV<string asmop, string Res, bit Q, + ValueType OpTy, ValueType eleTy, + Operand OpImm, RegisterClass ResGPR, ValueType ResTy> + : NeonI_copy<Q, 0b0, 0b0101, + (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." # Res # "[$Imm]", + [(set (ResTy ResGPR:$Rd), + (ResTy (sext_inreg + (ResTy (vector_extract + (OpTy VPR128:$Rn), (OpImm:$Imm))), + eleTy)))], + NoItinerary> { + bits<4> Imm; +} + +//Signed integer move (main, from element) +def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy, + ValueType eleTy, Operand StImm, Operand NaImm, + Instruction SMOVI> { + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + eleTy)), + (SMOVI VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + (SMOVI VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; +} + +defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare, + neon_uimm3_bare, SMOVxb>; +defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare, + neon_uimm2_bare, SMOVxh>; +defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare, + neon_uimm1_bare, SMOVxs>; + +class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy, + ValueType eleTy, Operand StImm, Operand NaImm, + Instruction SMOVI> + : Pat<(i32 (sext_inreg + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare, 
+ neon_uimm3_bare, SMOVwb>; +def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare, + neon_uimm2_bare, SMOVwh>; + +class NeonI_UMOV<string asmop, string Res, bit Q, + ValueType OpTy, Operand OpImm, + RegisterClass ResGPR, ValueType ResTy> + : NeonI_copy<Q, 0b0, 0b0111, + (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." # Res # "[$Imm]", + [(set (ResTy ResGPR:$Rd), + (ResTy (vector_extract + (OpTy VPR128:$Rn), (OpImm:$Imm))))], + NoItinerary> { + bits<4> Imm; +} + +//Unsigned integer move (main, from element) +def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, + GPR64, i64> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", + (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", + (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy, + Operand StImm, Operand NaImm, + Instruction SMOVI> + : Pat<(ResTy (vector_extract + (NaTy VPR64:$Rn), NaImm:$Imm)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare, + neon_uimm3_bare, UMOVwb>; +def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare, + neon_uimm2_bare, UMOVwh>; +def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare, + neon_uimm1_bare, UMOVws>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), + 255)), + (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), + 65535)), + (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), + (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), + 255)), + (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm3_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), + 65535)), + (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm2_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), + (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm0_bare:$Imm)>; + +// Additional copy patterns for scalar types +def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), + (UMOVwb (v16i8 + (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), + (UMOVwh (v8i16 + (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))), + (FMOVws FPR32:$Rn)>; + +def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), + (FMOVxd FPR64:$Rn)>; + +def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), + (f64 FPR64:$Rn)>; + +def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))), + (f32 FPR32:$Rn)>; + +def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), + 
(v1i8 (EXTRACT_SUBREG (v16i8 + (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_8))>; + +def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), + (v1i16 (EXTRACT_SUBREG (v8i16 + (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_16))>; + +def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), + (FMOVsw $src)>; + +def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), + (FMOVdx $src)>; + +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))), + (v1f32 FPR32:$Rn)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), + (v1f64 FPR64:$Rn)>; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; + +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), + (f64 FPR64:$src), sub_64)>; + +class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane, + RegisterOperand ResVPR, Operand OpImm> + : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd), + (ins VPR128:$Rn, OpImm:$Imm), + asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]", + [], + NoItinerary> { + bits<4> Imm; +} + +def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy, + ValueType OpTy,ValueType NaTy, + ValueType ExTy, Operand OpLImm, + Operand OpNImm> { +def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; + +def : Pat<(ResTy (Neon_vduplane + (NaTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPELT + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; +} +defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8, + neon_uimm4_bare, neon_uimm3_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8, + neon_uimm4_bare, neon_uimm3_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16, + neon_uimm3_bare, neon_uimm2_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16, + neon_uimm3_bare, neon_uimm2_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64, + neon_uimm1_bare, neon_uimm0_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64, + neon_uimm1_bare, neon_uimm0_bare>; + +def : Pat<(v2f32 (Neon_vdup 
(f32 FPR32:$Rn))), + (v2f32 (DUPELT2s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), + (v4f32 (DUPELT4s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), + (v2f64 (DUPELT2d + (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), + (i64 0)))>; + +class NeonI_DUP<bit Q, string asmop, string rdlane, + RegisterOperand ResVPR, ValueType ResTy, + RegisterClass OpGPR, ValueType OpTy> + : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn), + asmop # "\t$Rd" # rdlane # ", $Rn", + [(set (ResTy ResVPR:$Rd), + (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))], + NoItinerary>; + +def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. +} + +def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. +} + +def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { + let Inst{20-16} = 0b01000; + // bit 20 is unspecified, but should be set to zero. +} + +def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. +} + +def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. 
+} + +// patterns for CONCAT_VECTORS +multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> { +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), + (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), + (INSELd + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), + (i64 1), + (i64 0))>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), + (DUPELT2d + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (i64 0))> ; +} + +defm : Concat_Vector_Pattern<v16i8, v8i8>; +defm : Concat_Vector_Pattern<v8i16, v4i16>; +defm : Concat_Vector_Pattern<v4i32, v2i32>; +defm : Concat_Vector_Pattern<v2i64, v1i64>; +defm : Concat_Vector_Pattern<v4f32, v2f32>; +defm : Concat_Vector_Pattern<v2f64, v1f64>; + +//patterns for EXTRACT_SUBVECTOR +def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), + (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), + (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), + (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), + (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), + (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), + (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; + +// The followings are for instruction class (3V Elem) + +// Variant 1 + +class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem<q, u, size, opcode, + (outs ResVPR:$Rd), (ins ResVPR:$src, OpVPR:$Rn, + EleOpVPR:$Re, OpImm:$Index), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # + ", $Re." # EleOpS # "[$Index]", + [], + NoItinerary> { + bits<3> Index; + bits<5> Re; + + let Constraints = "$src = $Rd"; +} + +multiclass NI_2VE_v1<bit u, bits<4> opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
+ def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; +defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; + +// Pattern for lane in 128-bit vector +class NI_2VE_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand ResVPR, RegisterOperand OpVPR, + RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy, + ValueType EleOpTy> + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_lane<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand ResVPR, RegisterOperand OpVPR, + RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy, + ValueType EleOpTy> + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_v1_pat<string subop, SDPatternOperator op> +{ + def : NI_2VE_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; +defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; + +class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem<q, u, size, opcode, + (outs ResVPR:$Rd), (ins OpVPR:$Rn, + EleOpVPR:$Re, OpImm:$Index), + asmop # "\t$Rd." # ResS # ", $Rn." # OpS # + ", $Re." 
# EleOpS # "[$Index]", + [], + NoItinerary> { + bits<3> Index; + bits<5> Re; +} + +multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; +defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; +defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; + +// Pattern for lane in 128-bit vector +class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand OpVPR, RegisterOperand EleOpVPR, + ValueType ResTy, ValueType OpTy, ValueType EleOpTy> + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand OpVPR, RegisterOperand EleOpVPR, + ValueType ResTy, ValueType OpTy, ValueType EleOpTy> + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_mul_v1_pat<string subop, SDPatternOperator op> { + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; +defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; +defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; + +// Variant 2 + +multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = 
{Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! + + def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; +defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; + +class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand OpVPR, RegisterOperand EleOpVPR, + ValueType ResTy, ValueType OpTy, ValueType EleOpTy, + SDPatternOperator coreop> + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; + +multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> { + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2f32, v2f32, v4f32>; + + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4f32, v4f32, v4f32>; + + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR128, v2f64, v2f64, v2f64>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2f32, v2f32, v2f32>; + + def : NI_2VE_mul_lane_2d<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR64, v2f64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; +defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; + +def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), + (v2f32 VPR64:$Rn))), + (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), + (v4f32 VPR128:$Rn))), + (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), + (v2f64 VPR128:$Rn))), + (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; + +// The followings are patterns using fma +// -ffp-contract=fast generates fma + +multiclass NI_2VE_v2<bit u, bits<4> opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
+ + def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; +defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; + +// Pattern for lane in 128-bit vector +class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand ResVPR, RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, + SDPatternOperator coreop> + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), + (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane 0 +class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op, + RegisterOperand ResVPR, ValueType ResTy> + : Pat<(ResTy (op (ResTy ResVPR:$Rn), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand ResVPR, RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, + SDPatternOperator coreop> + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane_2d2d<Instruction INST, Operand OpImm, + SDPatternOperator op, + RegisterOperand ResVPR, RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, + SDPatternOperator coreop> + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; + + +multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> { + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; + +// Pattern for lane 0 +class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op, + RegisterOperand ResVPR, ValueType ResTy> + : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op> +{ + def : 
NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(fneg (Neon_combine_2d + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d + (fneg node:$LHS), (fneg node:$RHS))>>; +} + +defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; + +// Variant 3: Long type +// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S +// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S + +multiclass NI_2VE_v3<bit u, bits<4> opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
+ def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; +defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; +defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; +defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; +defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; +defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; + +multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; +defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; +defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), + (FMOVss $src)>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand EleOpVPR, ValueType ResTy, + ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy, + SDPatternOperator hiop> + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand EleOpVPR, ValueType ResTy, + ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy, + SDPatternOperator hiop> + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op, + ValueType ResTy, ValueType OpTy, ValueType HalfOpTy, + SDPatternOperator hiop, Instruction DupInst> + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass 
NI_2VEL_v3_pat<string subop, SDPatternOperator op> { + def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare, + op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + + def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare, + op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; + + def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare, + op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + + def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare, + op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; + + def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; +defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; +defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; +defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand EleOpVPR, ValueType ResTy, + ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy, + SDPatternOperator hiop> + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op, + RegisterOperand EleOpVPR, ValueType ResTy, + ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy, + SDPatternOperator hiop> + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for fixed lane 0 +class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op, + ValueType ResTy, ValueType OpTy, ValueType HalfOpTy, + SDPatternOperator hiop, Instruction DupInst> + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> { + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + + def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i64, v2i32, v4i32>; + + def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # 
"_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + + def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i64, v2i32, v2i32>; + + def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; +defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; +defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; + +multiclass NI_qdma<SDPatternOperator op> { + def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; + + def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; +} + +defm Neon_qdmlal : NI_qdma<int_arm_neon_vqadds>; +defm Neon_qdmlsl : NI_qdma<int_arm_neon_vqsubs>; + +multiclass NI_2VEL_v3_qdma_pat<string subop, string op> { + def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare, + !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR128Lo, + v4i32, v4i16, v8i16>; + + def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare, + !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR128, + v2i64, v2i32, v4i32>; + + def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare, + !cast<PatFrag>(op # "_4s"), VPR128Lo, + v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare, + !cast<PatFrag>(op # "_2d"), VPR128, + v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"), + !cast<PatFrag>(op # "_4s"), + v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"), + !cast<PatFrag>(op # "_2d"), + v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare, + !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR64Lo, + v4i32, v4i16, v4i16>; + + def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare, + !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR64, + v2i64, v2i32, v2i32>; + + def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare, + !cast<PatFrag>(op # "_4s"), VPR64Lo, + v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare, + !cast<PatFrag>(op # "_2d"), VPR64, + v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; +defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; + +// End of implementation for instruction class (3V Elem) + +class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U, + bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, + SDPatternOperator Neon_Rev> + : NeonI_2VMisc<Q, U, size, opcode, + (outs ResVPR:$Rd), (ins ResVPR:$Rn), + asmop # "\t$Rd." 
# Res # ", $Rn." # Res, + [(set (ResTy ResVPR:$Rd), + (ResTy (Neon_Rev (ResTy ResVPR:$Rn))))], + NoItinerary> ; + +def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128, + v16i8, Neon_rev64>; +def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128, + v8i16, Neon_rev64>; +def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128, + v4i32, Neon_rev64>; +def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64, + v8i8, Neon_rev64>; +def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64, + v4i16, Neon_rev64>; +def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64, + v2i32, Neon_rev64>; + +def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>; +def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>; + +def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128, + v16i8, Neon_rev32>; +def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128, + v8i16, Neon_rev32>; +def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64, + v8i8, Neon_rev32>; +def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64, + v4i16, Neon_rev32>; + +def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128, + v16i8, Neon_rev16>; +def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64, + v8i8, Neon_rev16>; + +multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode, + SDPatternOperator Neon_Padd> { + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.16b", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.8b", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.8h", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.4h", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.4s", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.1d, $Rn.2s", + [(set (v1i64 VPR64:$Rd), + (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010, + int_arm_neon_vpaddls>; +defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010, + int_arm_neon_vpaddlu>; + +multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode, + SDPatternOperator Neon_Padd> { + let Constraints = "$src = $Rd" in { + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.16b", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Padd + (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.8b", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Padd + (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 8h4s 
: NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.8h", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Padd + (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.4h", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Padd + (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.4s", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_Padd + (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.1d, $Rn.2s", + [(set (v1i64 VPR64:$Rd), + (v1i64 (Neon_Padd + (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))], + NoItinerary>; + } +} + +defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110, + int_arm_neon_vpadals>; +defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110, + int_arm_neon_vpadalu>; + +multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> { + def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [], NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [], NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [], NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [], NoItinerary>; +} + +defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>; +defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>; +defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>; +defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>; + +multiclass NeonI_2VMisc_BHSD_1Arg_Pattern<string Prefix, + SDPatternOperator Neon_Op> { + def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))), + (v16i8 (!cast<Instruction>(Prefix # 16b) (v16i8 VPR128:$Rn)))>; + + def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))), + (v8i16 (!cast<Instruction>(Prefix # 8h) (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))), + (v4i32 (!cast<Instruction>(Prefix # 4s) (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))), + (v2i64 (!cast<Instruction>(Prefix # 2d) (v2i64 VPR128:$Rn)))>; + + def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))), + (v8i8 (!cast<Instruction>(Prefix # 8b) (v8i8 VPR64:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))), + (v4i16 (!cast<Instruction>(Prefix # 4h) (v4i16 VPR64:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))), + (v2i32 (!cast<Instruction>(Prefix # 2s) (v2i32 VPR64:$Rn)))>; +} + +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>; +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>; +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>; + +def : Pat<(v16i8 (sub + 
(v16i8 Neon_AllZero), + (v16i8 VPR128:$Rn))), + (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (sub + (v8i8 Neon_AllZero), + (v8i8 VPR64:$Rn))), + (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>; +def : Pat<(v8i16 (sub + (v8i16 (bitconvert (v16i8 Neon_AllZero))), + (v8i16 VPR128:$Rn))), + (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>; +def : Pat<(v4i16 (sub + (v4i16 (bitconvert (v8i8 Neon_AllZero))), + (v4i16 VPR64:$Rn))), + (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>; +def : Pat<(v4i32 (sub + (v4i32 (bitconvert (v16i8 Neon_AllZero))), + (v4i32 VPR128:$Rn))), + (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>; +def : Pat<(v2i32 (sub + (v2i32 (bitconvert (v8i8 Neon_AllZero))), + (v2i32 VPR64:$Rn))), + (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>; +def : Pat<(v2i64 (sub + (v2i64 (bitconvert (v16i8 Neon_AllZero))), + (v2i64 VPR128:$Rn))), + (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>; + +multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> { + let Constraints = "$src = $Rd" in { + def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [], NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [], NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [], NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [], NoItinerary>; + } +} + +defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>; +defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>; + +multiclass NeonI_2VMisc_BHSD_2Args_Pattern<string Prefix, + SDPatternOperator Neon_Op> { + def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))), + (v16i8 (!cast<Instruction>(Prefix # 16b) + (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>; + + def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))), + (v8i16 (!cast<Instruction>(Prefix # 8h) + (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))), + (v4i32 (!cast<Instruction>(Prefix # 4s) + (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))), + (v2i64 (!cast<Instruction>(Prefix # 2d) + (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>; + + def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))), + (v8i8 (!cast<Instruction>(Prefix # 8b) + (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))), + (v4i16 (!cast<Instruction>(Prefix # 4h) + (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))), + (v2i32 (!cast<Instruction>(Prefix # 2s) + (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>; +} + +defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>; +defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>; + +multiclass NeonI_2VMisc_BHSsizes<string asmop, bit 
U, + SDPatternOperator Neon_Op> { + def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>; +defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>; + +multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size, + bits<5> Opcode> { + def 16b : NeonI_2VMisc<0b1, U, size, Opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, size, Opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; +} + +defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>; +defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>; +defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>; + +def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b", + (NOT16b VPR128:$Rd, VPR128:$Rn), 0>; +def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b", + (NOT8b VPR64:$Rd, VPR64:$Rn), 0>; + +def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))), + (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))), + (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>; + +def : Pat<(v16i8 (xor + (v16i8 VPR128:$Rn), + (v16i8 Neon_AllOne))), + (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (xor + (v8i8 VPR64:$Rn), + (v8i8 Neon_AllOne))), + (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>; +def : Pat<(v8i16 (xor + (v8i16 VPR128:$Rn), + (v8i16 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; +def : Pat<(v4i16 (xor + (v4i16 VPR64:$Rn), + (v4i16 (bitconvert (v8i8 Neon_AllOne))))), + (NOT8b VPR64:$Rn)>; +def : Pat<(v4i32 (xor + (v4i32 VPR128:$Rn), + (v4i32 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; +def : Pat<(v2i32 (xor + (v2i32 VPR64:$Rn), + (v2i32 (bitconvert (v8i8 Neon_AllOne))))), + (NOT8b VPR64:$Rn)>; +def : Pat<(v2i64 (xor + (v2i64 VPR128:$Rn), + (v2i64 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; + +def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))), + (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))), + (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>; + +multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode, + SDPatternOperator Neon_Op> { + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4f32 VPR128:$Rd), + (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))], + NoItinerary>; + + def 2d : 
NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [(set (v2f64 VPR128:$Rd), + (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2f32 VPR64:$Rd), + (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))], + NoItinerary>; +} + +defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>; +defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>; + +multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> { + def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8b, $Rn.8h", + [], NoItinerary>; + + def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4h, $Rn.4s", + [], NoItinerary>; + + def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + let Constraints = "$Rd = $src" in { + def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.16b, $Rn.8h", + [], NoItinerary>; + + def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.8h, $Rn.4s", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary>; + } +} + +defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>; +defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>; +defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>; +defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>; + +multiclass NeonI_2VMisc_Narrow_Patterns<string Prefix, + SDPatternOperator Neon_Op> { + def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))), + (v8i8 (!cast<Instruction>(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))), + (v4i16 (!cast<Instruction>(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))), + (v2i32 (!cast<Instruction>(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>; + + def : Pat<(v16i8 (concat_vectors + (v8i8 VPR64:$src), + (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))), + (!cast<Instruction>(Prefix # 8h16b) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; + + def : Pat<(v8i16 (concat_vectors + (v4i16 VPR64:$src), + (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))), + (!cast<Instruction>(Prefix # 4s8h) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; + + def : Pat<(v4i32 (concat_vectors + (v2i32 VPR64:$src), + (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))), + (!cast<Instruction>(Prefix # 2d4s) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; +} + +defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>; +defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>; +defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>; +defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>; + +multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> { + let DecoderMethod = "DecodeSHLLInstruction" in { + def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact8:$Imm), + asmop # "\t$Rd.8h, $Rn.8b, $Imm", + [], NoItinerary>; + + def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact16:$Imm), + asmop # "\t$Rd.4s, $Rn.4h, $Imm", + [], NoItinerary>; + + def 2s2d : 
NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact32:$Imm), + asmop # "\t$Rd.2d, $Rn.2s, $Imm", + [], NoItinerary>; + + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact8:$Imm), + asmop # "2\t$Rd.8h, $Rn.16b, $Imm", + [], NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact16:$Imm), + asmop # "2\t$Rd.4s, $Rn.8h, $Imm", + [], NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact32:$Imm), + asmop # "2\t$Rd.2d, $Rn.4s, $Imm", + [], NoItinerary>; + } +} + +defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>; + +class NeonI_SHLL_Patterns<ValueType OpTy, ValueType DesTy, + SDPatternOperator ExtOp, Operand Neon_Imm, + string suffix> + : Pat<(DesTy (shl + (DesTy (ExtOp (OpTy VPR64:$Rn))), + (DesTy (Neon_vdup + (i32 Neon_Imm:$Imm))))), + (!cast<Instruction>("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>; + +class NeonI_SHLL_High_Patterns<ValueType OpTy, ValueType DesTy, + SDPatternOperator ExtOp, Operand Neon_Imm, + string suffix, PatFrag GetHigh> + : Pat<(DesTy (shl + (DesTy (ExtOp + (OpTy (GetHigh VPR128:$Rn)))), + (DesTy (Neon_vdup + (i32 Neon_Imm:$Imm))))), + (!cast<Instruction>("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>; + +def : NeonI_SHLL_Patterns<v8i8, v8i16, zext, uimm_exact8, "8b8h">; +def : NeonI_SHLL_Patterns<v8i8, v8i16, sext, uimm_exact8, "8b8h">; +def : NeonI_SHLL_Patterns<v4i16, v4i32, zext, uimm_exact16, "4h4s">; +def : NeonI_SHLL_Patterns<v4i16, v4i32, sext, uimm_exact16, "4h4s">; +def : NeonI_SHLL_Patterns<v2i32, v2i64, zext, uimm_exact32, "2s2d">; +def : NeonI_SHLL_Patterns<v2i32, v2i64, sext, uimm_exact32, "2s2d">; +def : NeonI_SHLL_High_Patterns<v8i8, v8i16, zext, uimm_exact8, "16b8h", + Neon_High16B>; +def : NeonI_SHLL_High_Patterns<v8i8, v8i16, sext, uimm_exact8, "16b8h", + Neon_High16B>; +def : NeonI_SHLL_High_Patterns<v4i16, v4i32, zext, uimm_exact16, "8h4s", + Neon_High8H>; +def : NeonI_SHLL_High_Patterns<v4i16, v4i32, sext, uimm_exact16, "8h4s", + Neon_High8H>; +def : NeonI_SHLL_High_Patterns<v2i32, v2i64, zext, uimm_exact32, "4s2d", + Neon_High4S>; +def : NeonI_SHLL_High_Patterns<v2i32, v2i64, sext, uimm_exact32, "4s2d", + Neon_High4S>; + +multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> { + def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4h, $Rn.4s", + [], NoItinerary>; + + def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + let Constraints = "$src = $Rd" in { + def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.8h, $Rn.4s", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary>; + } +} + +defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>; + +multiclass NeonI_2VMisc_Narrow_Pattern<string prefix, + SDPatternOperator f32_to_f16_Op, + SDPatternOperator f64_to_f32_Op> { + + def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))), + (!cast<Instruction>(prefix # "4s4h") (v4f32 VPR128:$Rn))>; + + def : Pat<(v8i16 (concat_vectors + (v4i16 VPR64:$src), + (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))), + (!cast<Instruction>(prefix # "4s8h") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + (v4f32 VPR128:$Rn))>; + + def : 
Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))), + (!cast<Instruction>(prefix # "2d2s") (v2f64 VPR128:$Rn))>; + + def : Pat<(v4f32 (concat_vectors + (v2f32 VPR64:$src), + (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))), + (!cast<Instruction>(prefix # "2d4s") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + (v2f64 VPR128:$Rn))>; +} + +defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>; + +multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U, + bits<5> opcode> { + def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary> { + let Constraints = "$src = $Rd"; + } + + def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))), + (!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>; + + def : Pat<(v4f32 (concat_vectors + (v2f32 VPR64:$src), + (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))), + (!cast<Instruction>(prefix # "2d4s") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + VPR128:$Rn)>; +} + +defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>; + +def Neon_High4Float : PatFrag<(ops node:$in), + (extract_subvector (v4f32 node:$in), (iPTR 2))>; + +multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> { + def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4s, $Rn.4h", + [], NoItinerary>; + + def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2d, $Rn.2s", + [], NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.8h", + [], NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "2\t$Rd.2d, $Rn.4s", + [], NoItinerary>; +} + +defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>; + +multiclass NeonI_2VMisc_Extend_Pattern<string prefix> { + def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))), + (!cast<Instruction>(prefix # "4h4s") VPR64:$Rn)>; + + def : Pat<(v4f32 (int_arm_neon_vcvthf2fp + (v4i16 (Neon_High8H + (v8i16 VPR128:$Rn))))), + (!cast<Instruction>(prefix # "8h4s") VPR128:$Rn)>; + + def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))), + (!cast<Instruction>(prefix # "2s2d") VPR64:$Rn)>; + + def : Pat<(v2f64 (fextend + (v2f32 (Neon_High4Float + (v4f32 VPR128:$Rn))))), + (!cast<Instruction>(prefix # "4s2d") VPR128:$Rn)>; +} + +defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">; + +multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode, + ValueType ResTy4s, ValueType OpTy4s, + ValueType ResTy2d, ValueType OpTy2d, + ValueType ResTy2s, ValueType OpTy2s, + SDPatternOperator Neon_Op> { + + def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (ResTy4s VPR128:$Rd), + (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))], + NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [(set (ResTy2d VPR128:$Rd), + (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (ResTy2s VPR64:$Rd), + (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))], + NoItinerary>; +} + +multiclass 
NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U, + bits<5> opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4i32, v4f32, v2i64, + v2f64, v2i32, v2f32, Neon_Op>; +} + +defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010, + int_aarch64_neon_fcvtns>; +defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010, + int_aarch64_neon_fcvtnu>; +defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010, + int_aarch64_neon_fcvtps>; +defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010, + int_aarch64_neon_fcvtpu>; +defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011, + int_aarch64_neon_fcvtms>; +defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011, + int_aarch64_neon_fcvtmu>; +defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>; +defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>; +defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100, + int_aarch64_neon_fcvtas>; +defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100, + int_aarch64_neon_fcvtau>; + +multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U, + bits<5> opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64, + v2i64, v2f32, v2i32, Neon_Op>; +} + +defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>; +defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>; + +multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U, + bits<5> opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64, + v2f64, v2f32, v2f32, Neon_Op>; +} + +defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000, + int_aarch64_neon_frintn>; +defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>; +defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>; +defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>; +defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>; +defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>; +defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>; +defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101, + int_arm_neon_vrecpe>; +defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101, + int_arm_neon_vrsqrte>; +defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>; + +multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U, + bits<5> opcode, SDPatternOperator Neon_Op> { + def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100, + int_arm_neon_vrecpe>; +defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100, + int_arm_neon_vrsqrte>; + +// Crypto Class +class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_AES<size, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode (v16i8 
VPR128:$src), + (v16i8 VPR128:$Rn))))], + NoItinerary>{ + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>; +def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>; + +class NeonI_Cryptoaes<bits<2> size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_AES<size, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode (v16i8 VPR128:$Rn))))], + NoItinerary>; + +def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>; +def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>; + +class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_SHA<size, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$src), + (v4i32 VPR128:$Rn))))], + NoItinerary> { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1", + int_arm_neon_sha1su1>; +def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0", + int_arm_neon_sha256su0>; + +class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_SHA<size, opcode, + (outs FPR32:$Rd), (ins FPR32:$Rn), + asmop # "\t$Rd, $Rn", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v1i32 FPR32:$Rn))))], + NoItinerary> { + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>; + +class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop, + SDPatternOperator opnode> + : NeonI_Crypto_3VSHA<size, opcode, + (outs VPR128:$Rd), + (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$src), + (v4i32 VPR128:$Rn), + (v4i32 VPR128:$Rm))))], + NoItinerary> { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0", + int_arm_neon_sha1su0>; +def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1", + int_arm_neon_sha256su1>; + +class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop, + SDPatternOperator opnode> + : NeonI_Crypto_3VSHA<size, opcode, + (outs FPR128:$Rd), + (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd, $Rn, $Rm.4s", + [(set (v4i32 FPR128:$Rd), + (v4i32 (opnode (v4i32 FPR128:$src), + (v4i32 FPR128:$Rn), + (v4i32 VPR128:$Rm))))], + NoItinerary> { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h", + int_arm_neon_sha256h>; +def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2", + int_arm_neon_sha256h2>; + +class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop, + SDPatternOperator opnode> + : NeonI_Crypto_3VSHA<size, opcode, + (outs FPR128:$Rd), + (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm), + asmop # "\t$Rd, $Rn, $Rm.4s", + [(set (v4i32 FPR128:$Rd), + (v4i32 (opnode (v4i32 FPR128:$src), + (v1i32 FPR32:$Rn), + (v4i32 VPR128:$Rm))))], + NoItinerary> { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>; +def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", 
int_aarch64_neon_sha1p>; +def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>; + +// +// Patterns for handling half-precision values +// + +// Convert f16 value coming in as i16 value to f32 +def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))), + (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; +def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))), + (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; + +def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 ( + f32_to_f16 (f32 FPR32:$Rn))))))), + (f32 FPR32:$Rn)>; + +// Patterns for vector extract of half-precision FP value in i16 storage type +def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract + (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))), + (FCVTsh (f16 (DUPhv_H + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + neon_uimm2_bare:$Imm)))>; + +def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract + (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))), + (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>; + +// Patterns for vector insert of half-precision FP value 0 in i16 storage type +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), + (neon_uimm3_bare:$Imm))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), + sub_16)), + neon_uimm3_bare:$Imm, 0))>; + +def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), + (neon_uimm2_bare:$Imm))), + (v4i16 (EXTRACT_SUBREG + (v8i16 (INSELh + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), + sub_16)), + neon_uimm2_bare:$Imm, 0)), + sub_64))>; + +// Patterns for vector insert of half-precision FP value in i16 storage type +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (assertsext (i32 (fp_to_sint + (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), + (neon_uimm3_bare:$Imm))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), + sub_16)), + neon_uimm3_bare:$Imm, 0))>; + +def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), + (i32 (assertsext (i32 (fp_to_sint + (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), + (neon_uimm2_bare:$Imm))), + (v4i16 (EXTRACT_SUBREG + (v8i16 (INSELh + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), + sub_16)), + neon_uimm2_bare:$Imm, 0)), + sub_64))>; + +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), + (neon_uimm3_bare:$Imm1))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), + neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; + +// Patterns for vector copy of half-precision FP value in i16 storage type +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 + (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), + 65535)))))))), + (neon_uimm3_bare:$Imm1))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), + neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; + +def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 + (vector_extract (v4i16 
VPR64:$src), neon_uimm3_bare:$Imm2)), + 65535)))))))), + (neon_uimm3_bare:$Imm1))), + (v4i16 (EXTRACT_SUBREG + (v8i16 (INSELh + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)), + sub_64))>; + + diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 3d22330..8cfb968 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -109,6 +109,11 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; + case MachineOperand::MO_FPImmediate: { + assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported"); + MCOp = MCOperand::CreateFPImm(0.0); + break; + } case MachineOperand::MO_BlockAddress: MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress())); break; @@ -116,7 +121,7 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName())); break; case MachineOperand::MO_GlobalAddress: - MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal())); + MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal())); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 20b0dcf..75ec44f 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -29,9 +29,8 @@ using namespace llvm; -AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo &tii, - const AArch64Subtarget &sti) - : AArch64GenRegisterInfo(AArch64::X30), TII(tii) { +AArch64RegisterInfo::AArch64RegisterInfo() + : AArch64GenRegisterInfo(AArch64::X30) { } const uint16_t * @@ -122,6 +121,8 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI, return; } + const AArch64InstrInfo &TII = + *static_cast<const AArch64InstrInfo*>(MF.getTarget().getInstrInfo()); int MinOffset, MaxOffset, OffsetScale; if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) { MinOffset = 0; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index bb64fd5..4d67943 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -25,12 +25,7 @@ class AArch64InstrInfo; class AArch64Subtarget; struct AArch64RegisterInfo : public AArch64GenRegisterInfo { -private: - const AArch64InstrInfo &TII; - -public: - AArch64RegisterInfo(const AArch64InstrInfo &tii, - const AArch64Subtarget &sti); + AArch64RegisterInfo(); const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index bd79546..4e2022c 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -12,15 +12,25 @@ //===----------------------------------------------------------------------===// let Namespace = "AArch64" in { -def sub_128 : SubRegIndex; -def sub_64 : SubRegIndex; -def sub_32 : SubRegIndex; -def sub_16 : SubRegIndex; -def sub_8 : SubRegIndex; - -// The VPR registers are handled as sub-registers of FPR equivalents, but -// they're really the same thing. We give this concept a special index. 
-def sub_alias : SubRegIndex; +def sub_128 : SubRegIndex<128>; +def sub_64 : SubRegIndex<64>; +def sub_32 : SubRegIndex<32>; +def sub_16 : SubRegIndex<16>; +def sub_8 : SubRegIndex<8>; + +// Note: Code depends on these having consecutive numbers. +def qqsub : SubRegIndex<256, 256>; + +def qsub_0 : SubRegIndex<128>; +def qsub_1 : SubRegIndex<128, 128>; +def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>; +def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>; + +def dsub_0 : SubRegIndex<64>; +def dsub_1 : SubRegIndex<64, 64>; +def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>; +def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>; +def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>; } // Registers are identified with 5-bit ID numbers. @@ -137,60 +147,51 @@ foreach Index = 0-31 in { } -def FPR8 : RegisterClass<"AArch64", [i8], 8, +def FPR8 : RegisterClass<"AArch64", [i8, v1i8], 8, (sequence "B%u", 0, 31)> { } -def FPR16 : RegisterClass<"AArch64", [f16], 16, +def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16, (sequence "H%u", 0, 31)> { } -def FPR32 : RegisterClass<"AArch64", [f32], 32, +def FPR32 : RegisterClass<"AArch64", [f32, v1i32, v1f32], 32, (sequence "S%u", 0, 31)> { } -def FPR64 : RegisterClass<"AArch64", [f64], 64, - (sequence "D%u", 0, 31)> { -} +def FPR64 : RegisterClass<"AArch64", + [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], + 64, (sequence "D%u", 0, 31)>; -def FPR128 : RegisterClass<"AArch64", [f128], 128, - (sequence "Q%u", 0, 31)> { -} +def FPR128 : RegisterClass<"AArch64", + [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], + 128, (sequence "Q%u", 0, 31)>; +def FPR64Lo : RegisterClass<"AArch64", + [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], + 64, (sequence "D%u", 0, 15)>; + +def FPR128Lo : RegisterClass<"AArch64", + [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], + 128, (sequence "Q%u", 0, 15)>; //===----------------------------------------------------------------------===// // Vector registers: //===----------------------------------------------------------------------===// -// NEON registers simply specify the overall vector, and it's expected that -// Instructions will individually specify the acceptable data layout. In -// principle this leaves two approaches open: -// + An operand, giving a single ADDvvv instruction (for example). This turns -// out to be unworkable in the assembly parser (without every Instruction -// having a "cvt" function, at least) because the constraints can't be -// properly enforced. It also complicates specifying patterns since each -// instruction will accept many types. -// + A bare token (e.g. ".2d"). This means the AsmParser has to know specific -// details about NEON registers, but simplifies most other details. -// -// The second approach was taken. - -foreach Index = 0-31 in { - def V # Index : AArch64RegWithSubs<Index, "v" # Index, - [!cast<Register>("Q" # Index)], - [sub_alias]>, - DwarfRegNum<[!add(Index, 64)]>; +def VPR64AsmOperand : AsmOperandClass { + let Name = "VPR"; + let PredicateMethod = "isReg"; + let RenderMethod = "addRegOperands"; } -// These two classes contain the same registers, which should be reasonably -// sensible for MC and allocation purposes, but allows them to be treated -// separately for things like stack spilling. 
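Conceptually, the ComposedSubRegIndex definitions earlier in this hunk compose an outer and an inner sub-register index: the result keeps the inner index's size and adds the two bit offsets, so dsub_2 (qsub_1 composed with dsub_0) selects the 64-bit slice at bit offset 128 of a 256-bit tuple, i.e. the third D register of a D-quad. A small sketch of that arithmetic (plain C++, reusing the .td names purely for illustration):

struct SubRegIdx { unsigned SizeInBits; unsigned OffsetInBits; };

// Composition keeps the inner size and sums the offsets, mirroring the
// SubRegIndex<size, offset> / ComposedSubRegIndex<outer, inner> forms above.
constexpr SubRegIdx compose(SubRegIdx Outer, SubRegIdx Inner) {
  return { Inner.SizeInBits, Outer.OffsetInBits + Inner.OffsetInBits };
}

constexpr SubRegIdx qsub_1 = { 128, 128 };
constexpr SubRegIdx dsub_0 = { 64, 0 };
constexpr SubRegIdx dsub_1 = { 64, 64 };

constexpr SubRegIdx dsub_2 = compose(qsub_1, dsub_0); // { 64, 128 }
constexpr SubRegIdx dsub_3 = compose(qsub_1, dsub_1); // { 64, 192 }

static_assert(dsub_2.OffsetInBits == 128, "third D register of a D-quad");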
-def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64, - (sequence "V%u", 0, 31)>; +def VPR64 : RegisterOperand<FPR64, "printVPRRegister">; + +def VPR128 : RegisterOperand<FPR128, "printVPRRegister">; + +def VPR64Lo : RegisterOperand<FPR64Lo, "printVPRRegister">; -def VPR128 : RegisterClass<"AArch64", - [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128, - (sequence "V%u", 0, 31)>; +def VPR128Lo : RegisterOperand<FPR128Lo, "printVPRRegister">; // Flags register def NZCV : Register<"nzcv"> { @@ -201,3 +202,90 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { let CopyCost = -1; let isAllocatable = 0; } + +//===----------------------------------------------------------------------===// +// Consecutive vector registers +//===----------------------------------------------------------------------===// +// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31 +def Tuples2D : RegisterTuples<[dsub_0, dsub_1], + [(rotl FPR64, 0), (rotl FPR64, 1)]>; + +// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1 +def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; + +// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2 +def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; + +// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31 +def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], + [(rotl FPR128, 0), (rotl FPR128, 1)]>; + +// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1 +def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; + +// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2 +def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; + +// The followings are super register classes to model 2/3/4 consecutive +// 64-bit/128-bit registers. + +def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>; + +def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> { + let Size = 192; // 3 x 64 bits, we have no predefined type of that size. +} + +def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>; + +def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>; + +def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> { + let Size = 384; // 3 x 128 bits, we have no predefined type of that size. 
+} + +def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>; + + +// The followings are vector list operands +multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count, + RegisterClass RegList> { + def _asmoperand : AsmOperandClass { + let Name = PREFIX # LAYOUT # Count; + let RenderMethod = "addVectorListOperands"; + let PredicateMethod = + "isVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"; + let ParserMethod = "ParseVectorList"; + } + + def _operand : RegisterOperand<RegList, + "printVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"> { + let ParserMatchClass = + !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand"); + } +} + +multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList, + RegisterClass QRegList> { + defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>; + defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>; + defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>; + defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>; + defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>; + defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>; + defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>; + defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>; +} + +// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand +defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>; +defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>; +defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>; +defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
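As the comments above note, the rotated (rotl FPR64, N) operands make the D-register tuples wrap around modulo 32, so Tuples3D really does include D31_D0_D1, and the Q-register tuples wrap the same way. A standalone sketch of that indexing (plain C++, helper name invented for illustration):

#include <array>
#include <cstdio>

// Registers in an N-element consecutive D-register tuple starting at Base,
// wrapping modulo 32 exactly like the rotated register lists above.
template <unsigned N>
std::array<unsigned, N> dTuple(unsigned Base) {
  std::array<unsigned, N> Regs{};
  for (unsigned i = 0; i < N; ++i)
    Regs[i] = (Base + i) % 32;
  return Regs;
}

int main() {
  auto T = dTuple<3>(31);
  std::printf("D%u_D%u_D%u\n", T[0], T[1], T[2]); // prints D31_D0_D1
}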
\ No newline at end of file diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index d17b738..5c693c1 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -25,13 +25,31 @@ using namespace llvm; +// Pin the vtable to this file. +void AArch64Subtarget::anchor() {} + AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS) - : AArch64GenSubtargetInfo(TT, CPU, FS) - , HasNEON(true) - , HasCrypto(true) - , TargetTriple(TT) { + : AArch64GenSubtargetInfo(TT, CPU, FS), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), TargetTriple(TT), CPUString(CPU) { + + initializeSubtargetFeatures(CPU, FS); +} + +void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU, + StringRef FS) { + if (CPU.empty()) + CPUString = "generic"; + + std::string FullFS = FS; + if (CPUString == "generic") { + // Enable FP by default. + if (FullFS.empty()) + FullFS = "+fp-armv8"; + else + FullFS = "+fp-armv8," + FullFS; + } - ParseSubtargetFeatures(CPU, FS); + ParseSubtargetFeatures(CPU, FullFS); } bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV, diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 2e9205f..bbfd3bc 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -27,18 +27,31 @@ class StringRef; class GlobalValue; class AArch64Subtarget : public AArch64GenSubtargetInfo { + virtual void anchor(); protected: + bool HasFPARMv8; bool HasNEON; bool HasCrypto; /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; + + /// CPUString - String name of used CPU. + std::string CPUString; + +private: + void initializeSubtargetFeatures(StringRef CPU, StringRef FS); + public: /// This constructor initializes the data members to match that /// of the specified triple. /// AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS); + virtual bool enableMachineScheduler() const { + return true; + } + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef FS); @@ -46,8 +59,13 @@ public: bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + + bool hasFPARMv8() const { return HasFPARMv8; } + bool hasNEON() const { return HasNEON; } + bool hasCrypto() const { return HasCrypto; } + const std::string & getCPUString() const { return CPUString; } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index df599d5..f1695e2 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -38,6 +38,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget) { + initAsmInfo(); } namespace { diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 69bb80a..fbbce11 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -54,8 +54,9 @@ public: #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { MCAsmParserExtension::Initialize(_Parser); // Initialize the set of available features. @@ -126,6 +127,11 @@ public: OperandMatchResultTy ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout, + SMLoc &LayoutLoc); + + OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &); + bool validateInstruction(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands); @@ -153,6 +159,7 @@ private: k_Immediate, // Including expressions referencing symbols k_Register, k_ShiftExtend, + k_VectorList, // A sequential list of 1 to 4 registers. k_SysReg, // The register operand of MRS and MSR instructions k_Token, // The mnemonic; other raw tokens the auto-generated k_WrappedRegister // Load/store exclusive permit a wrapped register. @@ -188,6 +195,13 @@ private: bool ImplicitAmount; }; + // A vector register list is a sequential list of 1 to 4 registers. 
+ struct VectorListOp { + unsigned RegNum; + unsigned Count; + A64Layout::VectorLayout Layout; + }; + struct SysRegOp { const char *Data; unsigned Length; @@ -205,6 +219,7 @@ private: struct ImmOp Imm; struct RegOp Reg; struct ShiftExtendOp ShiftExtend; + struct VectorListOp VectorList; struct SysRegOp SysReg; struct TokOp Tok; }; @@ -454,7 +469,7 @@ public: } bool isMOVN32Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_SABS_G0, AArch64MCExpr::VK_AARCH64_SABS_G1, AArch64MCExpr::VK_AARCH64_DTPREL_G1, @@ -463,13 +478,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(32, PermittedModifiers, NumModifiers); } bool isMOVN64Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_SABS_G0, AArch64MCExpr::VK_AARCH64_SABS_G1, AArch64MCExpr::VK_AARCH64_SABS_G2, @@ -481,14 +496,14 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(64, PermittedModifiers, NumModifiers); } bool isMOVZ32Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0, AArch64MCExpr::VK_AARCH64_ABS_G1, AArch64MCExpr::VK_AARCH64_SABS_G0, @@ -499,13 +514,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(32, PermittedModifiers, NumModifiers); } bool isMOVZ64Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0, AArch64MCExpr::VK_AARCH64_ABS_G1, AArch64MCExpr::VK_AARCH64_ABS_G2, @@ -521,13 +536,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(64, PermittedModifiers, NumModifiers); } bool isMOVK32Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0_NC, AArch64MCExpr::VK_AARCH64_ABS_G1_NC, AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, @@ -536,13 +551,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(32, PermittedModifiers, NumModifiers); } bool isMOVK64Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0_NC, AArch64MCExpr::VK_AARCH64_ABS_G1_NC, AArch64MCExpr::VK_AARCH64_ABS_G2_NC, @@ -553,13 +568,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, 
AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(64, PermittedModifiers, NumModifiers); } bool isMoveWideImm(unsigned RegWidth, - AArch64MCExpr::VariantKind *PermittedModifiers, + const AArch64MCExpr::VariantKind *PermittedModifiers, unsigned NumModifiers) const { if (!isImmWithLSL()) return false; @@ -664,8 +679,86 @@ public: return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4; } - template<int MemSize> bool isSImm7Scaled() const { - if (!isImm()) return false; + // if 0 < value <= w, return true + bool isShrFixedWidth(int w) const { + if (!isImm()) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= w; + } + + bool isShrImm8() const { return isShrFixedWidth(8); } + + bool isShrImm16() const { return isShrFixedWidth(16); } + + bool isShrImm32() const { return isShrFixedWidth(32); } + + bool isShrImm64() const { return isShrFixedWidth(64); } + + // if 0 <= value < w, return true + bool isShlFixedWidth(int w) const { + if (!isImm()) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < w; + } + + bool isShlImm8() const { return isShlFixedWidth(8); } + + bool isShlImm16() const { return isShlFixedWidth(16); } + + bool isShlImm32() const { return isShlFixedWidth(32); } + + bool isShlImm64() const { return isShlFixedWidth(64); } + + bool isNeonMovImmShiftLSL() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + // Valid shift amount is 0, 8, 16 and 24. + return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24; + } + + bool isNeonMovImmShiftLSLH() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + // Valid shift amount is 0 and 8. + return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8; + } + + bool isNeonMovImmShiftMSL() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::MSL) + return false; + + // Valid shift amount is 8 and 16. + return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16; + } + + template <A64Layout::VectorLayout Layout, unsigned Count> + bool isVectorList() const { + return Kind == k_VectorList && VectorList.Layout == Layout && + VectorList.Count == Count; + } + + template <int MemSize> bool isSImm7Scaled() const { + if (!isImm()) + return false; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); if (!CE) return false; @@ -705,10 +798,38 @@ public: return isa<MCConstantExpr>(getImm()); } + bool isNeonUImm64Mask() const { + if (!isImm()) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return false; + + uint64_t Value = CE->getValue(); + + // i64 value with each byte being either 0x00 or 0xff. 
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8) + if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) + return false; + return true; + } + + // if value == N, return true + template<int N> + bool isExactImm() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + return CE->getValue() == N; + } + static AArch64Operand *CreateImmWithLSL(const MCExpr *Val, unsigned ShiftAmount, bool ImplicitAmount, - SMLoc S, SMLoc E) { + SMLoc S,SMLoc E) { AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E); Op->ImmWithLSL.Val = Val; Op->ImmWithLSL.ShiftAmount = ShiftAmount; @@ -766,6 +887,18 @@ public: return Op; } + static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, + A64Layout::VectorLayout Layout, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.Layout = Layout; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + static AArch64Operand *CreateToken(StringRef Str, SMLoc S) { AArch64Operand *Op = new AArch64Operand(k_Token, S, S); Op->Tok.Data = Str.data(); @@ -1026,6 +1159,40 @@ public: Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); } + // For Vector Immediates shifted imm operands. + void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3. + int64_t Imm = ShiftExtend.Amount / 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode LSLH shift amount 0, 8 as 0, 1. + int64_t Imm = ShiftExtend.Amount / 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode MSL shift amount 8, 16 as 0, 1. + int64_t Imm = ShiftExtend.Amount / 8 - 1; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + // For the extend in load-store (register offset) instructions. template<unsigned MemSize> void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const { @@ -1065,6 +1232,25 @@ public: Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); } + + void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + // A bit from each byte in the constant forms the encoded immediate + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + uint64_t Value = CE->getValue(); + + unsigned Imm = 0; + for (unsigned i = 0; i < 8; ++i, Value >>= 8) { + Imm |= (Value & 1) << i; + } + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addVectorListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); + } }; } // end anonymous namespace. 
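A worked illustration of the NeonUImm64Mask operand handled above: isNeonUImm64Mask accepts an i64 whose bytes are each 0x00 or 0xff, and addNeonUImm64MaskOperands compresses that to an 8-bit immediate with one bit per byte. A self-contained sketch of the check, the encode step, and the corresponding decode (the helper names are invented for illustration):

#include <cassert>
#include <cstdint>

// Accept only values whose bytes are all 0x00 or 0xff, as isNeonUImm64Mask does.
bool isByteMask(uint64_t V) {
  for (unsigned i = 0; i < 8; ++i, V >>= 8)
    if ((V & 0xff) != 0 && (V & 0xff) != 0xff)
      return false;
  return true;
}

// One bit per byte, matching addNeonUImm64MaskOperands.
uint8_t encodeByteMask(uint64_t V) {
  unsigned Imm = 0;
  for (unsigned i = 0; i < 8; ++i, V >>= 8)
    Imm |= unsigned(V & 1) << i;
  return uint8_t(Imm);
}

// Expand each encoded bit back into a 0xff byte.
uint64_t decodeByteMask(uint8_t Imm) {
  uint64_t V = 0;
  for (unsigned i = 0; i < 8; ++i)
    if (Imm & (1u << i))
      V |= uint64_t(0xff) << (8 * i);
  return V;
}

int main() {
  const uint64_t V = 0x00ff00ff00ff00ffULL;
  assert(isByteMask(V));
  assert(encodeByteMask(V) == 0x55); // bytes ff,00,ff,00,... -> bits 01010101
  assert(decodeByteMask(0x55) == V);
}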
@@ -1104,7 +1290,6 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, else return MatchOperand_Success; } - // ... or it might be a symbolish thing } // Fall through @@ -1148,7 +1333,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, return ParseOperand(Operands, Mnemonic); } // The following will likely be useful later, but not in very early cases - case AsmToken::LCurly: // Weird SIMD lists + case AsmToken::LCurly: // SIMD vector list is not parsed here llvm_unreachable("Don't know how to deal with '{' in operand"); return MatchOperand_ParseFail; } @@ -1306,7 +1491,7 @@ AArch64AsmParser::ParseImmWithLSLOperand( // The optional operand must be "lsl #N" where N is non-negative. if (Parser.getTok().is(AsmToken::Identifier) - && Parser.getTok().getIdentifier().lower() == "lsl") { + && Parser.getTok().getIdentifier().equals_lower("lsl")) { Parser.Lex(); if (Parser.getTok().is(AsmToken::Hash)) { @@ -1363,9 +1548,8 @@ AArch64AsmParser::ParseCRxOperand( return MatchOperand_ParseFail; } - std::string LowerTok = Parser.getTok().getIdentifier().lower(); - StringRef Tok(LowerTok); - if (Tok[0] != 'c') { + StringRef Tok = Parser.getTok().getIdentifier(); + if (Tok[0] != 'c' && Tok[0] != 'C') { Error(S, "Expected cN operand where 0 <= N <= 15"); return MatchOperand_ParseFail; } @@ -1437,22 +1621,11 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, std::string LowerReg = Tok.getString().lower(); size_t DotPos = LowerReg.find('.'); - RegNum = MatchRegisterName(LowerReg.substr(0, DotPos)); - if (RegNum == AArch64::NoRegister) { - RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos)) - .Case("ip0", AArch64::X16) - .Case("ip1", AArch64::X17) - .Case("fp", AArch64::X29) - .Case("lr", AArch64::X30) - .Default(AArch64::NoRegister); - } - if (RegNum == AArch64::NoRegister) - return false; - + bool IsVec128 = false; SMLoc S = Tok.getLoc(); RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos); - if (DotPos == StringRef::npos) { + if (DotPos == std::string::npos) { Layout = StringRef(); } else { // Everything afterwards needs to be a literal token, expected to be @@ -1462,20 +1635,78 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, // gives us a permanent string to use in the token (a pointer into LowerReg // would go out of scope when we return). LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1); - std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos); + StringRef LayoutText = StringRef(LowerReg).substr(DotPos); + + // See if it's a 128-bit layout first. Layout = StringSwitch<const char *>(LayoutText) - .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d") - .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s") - .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h") - .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b") + .Case(".q", ".q").Case(".1q", ".1q") + .Case(".d", ".d").Case(".2d", ".2d") + .Case(".s", ".s").Case(".4s", ".4s") + .Case(".h", ".h").Case(".8h", ".8h") + .Case(".b", ".b").Case(".16b", ".16b") .Default(""); + if (Layout.size() != 0) + IsVec128 = true; + else { + Layout = StringSwitch<const char *>(LayoutText) + .Case(".1d", ".1d") + .Case(".2s", ".2s") + .Case(".4h", ".4h") + .Case(".8b", ".8b") + .Default(""); + } + if (Layout.size() == 0) { - // Malformed register + // If we've still not pinned it down the register is malformed. 
return false; } } + RegNum = MatchRegisterName(LowerReg.substr(0, DotPos)); + if (RegNum == AArch64::NoRegister) { + RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos)) + .Case("ip0", AArch64::X16) + .Case("ip1", AArch64::X17) + .Case("fp", AArch64::X29) + .Case("lr", AArch64::X30) + .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0) + .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1) + .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2) + .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3) + .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4) + .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5) + .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6) + .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7) + .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8) + .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9) + .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10) + .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11) + .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12) + .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13) + .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14) + .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15) + .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16) + .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17) + .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18) + .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19) + .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20) + .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21) + .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22) + .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23) + .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24) + .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25) + .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26) + .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27) + .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28) + .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29) + .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30) + .Case("v31", IsVec128 ? 
AArch64::Q31 : AArch64::D31) + .Default(AArch64::NoRegister); + } + if (RegNum == AArch64::NoRegister) + return false; + return true; } @@ -1507,6 +1738,7 @@ AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands, case 'h': NumLanes = 8; break; case 's': NumLanes = 4; break; case 'd': NumLanes = 2; break; + case 'q': NumLanes = 1; break; } } @@ -1660,20 +1892,21 @@ AArch64AsmParser::ParseShiftExtend( std::string LowerID = IDVal.lower(); A64SE::ShiftExtSpecifiers Spec = - StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID) - .Case("lsl", A64SE::LSL) - .Case("lsr", A64SE::LSR) - .Case("asr", A64SE::ASR) - .Case("ror", A64SE::ROR) - .Case("uxtb", A64SE::UXTB) - .Case("uxth", A64SE::UXTH) - .Case("uxtw", A64SE::UXTW) - .Case("uxtx", A64SE::UXTX) - .Case("sxtb", A64SE::SXTB) - .Case("sxth", A64SE::SXTH) - .Case("sxtw", A64SE::SXTW) - .Case("sxtx", A64SE::SXTX) - .Default(A64SE::Invalid); + StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID) + .Case("lsl", A64SE::LSL) + .Case("msl", A64SE::MSL) + .Case("lsr", A64SE::LSR) + .Case("asr", A64SE::ASR) + .Case("ror", A64SE::ROR) + .Case("uxtb", A64SE::UXTB) + .Case("uxth", A64SE::UXTH) + .Case("uxtw", A64SE::UXTW) + .Case("uxtx", A64SE::UXTX) + .Case("sxtb", A64SE::SXTB) + .Case("sxth", A64SE::SXTH) + .Case("sxtw", A64SE::SXTW) + .Case("sxtx", A64SE::SXTX) + .Default(A64SE::Invalid); if (Spec == A64SE::Invalid) return MatchOperand_NoMatch; @@ -1683,8 +1916,8 @@ AArch64AsmParser::ParseShiftExtend( S = Parser.getTok().getLoc(); Parser.Lex(); - if (Spec != A64SE::LSL && Spec != A64SE::LSR && - Spec != A64SE::ASR && Spec != A64SE::ROR) { + if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR && + Spec != A64SE::ROR && Spec != A64SE::MSL) { // The shift amount can be omitted for the extending versions, but not real // shifts: // add x0, x0, x0, uxtb @@ -1724,6 +1957,148 @@ AArch64AsmParser::ParseShiftExtend( return MatchOperand_Success; } +/// Try to parse a vector register token, If it is a vector register, +/// the token is eaten and return true. Otherwise return false. +bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, + StringRef &Layout, SMLoc &LayoutLoc) { + bool IsVector = true; + + if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)) + IsVector = false; + else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID] + .contains(RegNum) && + !AArch64MCRegisterClasses[AArch64::FPR128RegClassID] + .contains(RegNum)) + IsVector = false; + else if (Layout.size() == 0) + IsVector = false; + + if (!IsVector) + Error(Parser.getTok().getLoc(), "expected vector type register"); + + Parser.Lex(); // Eat this token. + return IsVector; +} + + +// A vector list contains 1-4 consecutive registers. +// Now there are two kinds of vector list when number of vector > 1: +// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} +// (2) {Vn.layout - Vm.layout} +// If the layout is like .b/.h/.s/.d, also parse the lane. +AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList( + SmallVectorImpl<MCParsedAsmOperand *> &Operands) { + if (Parser.getTok().isNot(AsmToken::LCurly)) { + Error(Parser.getTok().getLoc(), "'{' expected"); + return MatchOperand_ParseFail; + } + SMLoc SLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '{' token. + + unsigned Reg, Count = 1; + StringRef LayoutStr; + SMLoc RegEndLoc, LayoutLoc; + if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc)) + return MatchOperand_ParseFail; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. 
+ + unsigned Reg2; + StringRef LayoutStr2; + SMLoc RegEndLoc2, LayoutLoc2; + SMLoc RegLoc2 = Parser.getTok().getLoc(); + + if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) + return MatchOperand_ParseFail; + unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg); + + if (LayoutStr != LayoutStr2) { + Error(LayoutLoc2, "expected the same vector layout"); + return MatchOperand_ParseFail; + } + if (Space == 0 || Space > 3) { + Error(RegLoc2, "invalid number of vectors"); + return MatchOperand_ParseFail; + } + + Count += Space; + } else { + unsigned LastReg = Reg; + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + unsigned Reg2; + StringRef LayoutStr2; + SMLoc RegEndLoc2, LayoutLoc2; + SMLoc RegLoc2 = Parser.getTok().getLoc(); + + if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) + return MatchOperand_ParseFail; + unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg) + : (Reg2 + 32 - LastReg); + Count++; + + // The space between two vectors should be 1. And they should have the same layout. + // Total count shouldn't be great than 4 + if (Space != 1) { + Error(RegLoc2, "invalid space between two vectors"); + return MatchOperand_ParseFail; + } + if (LayoutStr != LayoutStr2) { + Error(LayoutLoc2, "expected the same vector layout"); + return MatchOperand_ParseFail; + } + if (Count > 4) { + Error(RegLoc2, "invalid number of vectors"); + return MatchOperand_ParseFail; + } + + LastReg = Reg2; + } + } + + if (Parser.getTok().isNot(AsmToken::RCurly)) { + Error(Parser.getTok().getLoc(), "'}' expected"); + return MatchOperand_ParseFail; + } + SMLoc ELoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '}' token. + + A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr); + if (Count > 1) { // If count > 1, create vector list using super register. + bool IsVec64 = (Layout < A64Layout::VL_16B); + static unsigned SupRegIDs[3][2] = { + { AArch64::QPairRegClassID, AArch64::DPairRegClassID }, + { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID }, + { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID } + }; + unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)]; + unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0; + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + Reg = MRI->getMatchingSuperReg(Reg, Sub0, + &AArch64MCRegisterClasses[SupRegID]); + } + Operands.push_back( + AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc)); + + if (Parser.getTok().is(AsmToken::LBrac)) { + uint32_t NumLanes = 0; + switch(Layout) { + case A64Layout::VL_B : NumLanes = 16; break; + case A64Layout::VL_H : NumLanes = 8; break; + case A64Layout::VL_S : NumLanes = 4; break; + case A64Layout::VL_D : NumLanes = 2; break; + default: + SMLoc Loc = getLexer().getLoc(); + Error(Loc, "expected comma before next operand"); + return MatchOperand_ParseFail; + } + return ParseNEONLane(Operands, NumLanes); + } else { + return MatchOperand_Success; + } +} + // FIXME: We would really like to be able to tablegen'erate this. 
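A standalone sketch (not part of the patch; it uses plain 0-31 register indices rather than the AArch64::Q*/D* enum values the parser works with) of the range arithmetic ParseVectorList applies above: a "{Vn.layout - Vm.layout}" list wraps modulo 32, so {v30.4s - v1.4s} is a legal four-register list, while anything outside two to four registers is rejected.

#include <cassert>

// Number of V registers named by the range [First, Last] with modulo-32
// wrap-around, or 0 if the range is not a legal 2-4 register list (the same
// "Space == 0 || Space > 3" check the parser performs).
static unsigned vectorListCount(unsigned First, unsigned Last) {
  unsigned Space = (First < Last) ? (Last - First) : (Last + 32 - First);
  if (Space == 0 || Space > 3)
    return 0;
  return Space + 1;
}

int main() {
  assert(vectorListCount(0, 3) == 4);   // {v0.4s - v3.4s}
  assert(vectorListCount(30, 1) == 4);  // {v30.4s - v1.4s}, wraps past v31
  assert(vectorListCount(0, 7) == 0);   // more than four registers
  assert(vectorListCount(5, 5) == 0);   // repeating the same register
  return 0;
}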
bool AArch64AsmParser:: validateInstruction(MCInst &Inst, @@ -1918,7 +2293,7 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/); + getParser().getStreamer().EmitValue(Value, Size); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -2019,7 +2394,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, "expected compatible register or floating-point constant"); case Match_FPZero: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected floating-point constant #0.0"); + "expected floating-point constant #0.0 or invalid register type"); case Match_Label: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), "expected label or encodable integer pc offset"); @@ -2140,6 +2515,30 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_Width64: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), "expected integer in range [<lsb>, 63]"); + case Match_ShrImm8: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 8]"); + case Match_ShrImm16: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 16]"); + case Match_ShrImm32: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 32]"); + case Match_ShrImm64: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 64]"); + case Match_ShlImm8: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 7]"); + case Match_ShlImm16: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 15]"); + case Match_ShlImm32: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 31]"); + case Match_ShlImm64: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 63]"); } llvm_unreachable("Implement any new match types added!"); diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 8164d6f..0f2e816 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -28,6 +28,8 @@ add_llvm_target(AArch64CodeGen AArch64TargetObjectFile.cpp ) +add_dependencies(LLVMAArch64CodeGen AArch64CommonTableGen) + add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 12c1b8f..be4d7f2 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -38,7 +38,7 @@ typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { /// AArch64 disassembler for all AArch64 platforms. class AArch64Disassembler : public MCDisassembler { - const MCRegisterInfo *RegInfo; + OwningPtr<const MCRegisterInfo> RegInfo; public: /// Initializes the disassembler. /// @@ -46,8 +46,7 @@ public: : MCDisassembler(STI), RegInfo(Info) { } - ~AArch64Disassembler() { - } + ~AArch64Disassembler() {} /// See MCDisassembler. 
DecodeStatus getInstruction(MCInst &instr, @@ -57,7 +56,7 @@ public: raw_ostream &vStream, raw_ostream &cStream) const; - const MCRegisterInfo *getRegInfo() const { return RegInfo; } + const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); } }; } @@ -83,12 +82,38 @@ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, @@ -111,6 +136,30 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); + template<int RegWidth> static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, unsigned FullImm, @@ -127,6 +176,10 @@ static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, unsigned ShiftAmount, uint64_t Address, const void *Decoder); +template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf> +static DecodeStatus +DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, + uint64_t Address, const void *Decoder); static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, 
unsigned ShiftAmount, @@ -177,6 +230,17 @@ static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In); @@ -208,7 +272,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, uint8_t bytes[4]; // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) { + if (Region.readBytes(Address, 4, bytes) == -1) { Size = 0; return MCDisassembler::Fail; } @@ -325,6 +389,14 @@ DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus +DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder); +} static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, @@ -338,16 +410,79 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 30) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo, + unsigned RegID, + const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo); + uint16_t Register = getReg(Decoder, RegID, RegNo); Inst.addOperand(MCOperand::CreateReg(Register)); return MCDisassembler::Success; } +static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID, + Decoder); +} + +static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID, + Decoder); +} + +static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID, + Decoder); +} + +static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID, + Decoder); +} + +static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, 
RegNo, AArch64::DQuadRegClassID, + Decoder); +} + +static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID, + Decoder); +} + static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, uint64_t Address, @@ -396,7 +531,73 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(8 - Val)); + return MCDisassembler::Success; +} +static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(16 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(32 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 7) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 15) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 63) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} template<int RegWidth> static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, @@ -553,11 +754,11 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); if (IsToVec) { - DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); } else { DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); } // Add the lane @@ -800,4 +1001,572 @@ extern "C" void LLVMInitializeAArch64Disassembler() { createAArch64Disassembler); } +template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf> +static DecodeStatus +DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, + uint64_t Address, const void *Decoder) { + bool IsLSL = false; + if (Ext == A64SE::LSL) + IsLSL = true; + else if (Ext != A64SE::MSL) + return MCDisassembler::Fail; + + // MSL and LSLH accepts encoded shift amount 0 or 1. 
+ if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1) + return MCDisassembler::Fail; + + // LSL accepts encoded shift amount 0, 1, 2 or 3. + if (IsLSL && ShiftAmount > 3) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); + return MCDisassembler::Success; +} + +// Decode post-index vector load/store instructions. +// This is necessary as we need to decode Rm: if Rm == 0b11111, the last +// operand is an immediate equal the the length of vector list in bytes, +// or Rm is decoded to a GPR64noxzr register. +static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned Rm = fieldFromInstruction(Insn, 16, 5); + unsigned Opcode = fieldFromInstruction(Insn, 12, 4); + unsigned IsLoad = fieldFromInstruction(Insn, 22, 1); + // 0 for 64bit vector list, 1 for 128bit vector list + unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1); + + unsigned NumVecs; + switch (Opcode) { + case 0: // ld4/st4 + case 2: // ld1/st1 with 4 vectors + NumVecs = 4; break; + case 4: // ld3/st3 + case 6: // ld1/st1 with 3 vectors + NumVecs = 3; break; + case 7: // ld1/st1 with 1 vector + NumVecs = 1; break; + case 8: // ld2/st2 + case 10: // ld1/st1 with 2 vectors + NumVecs = 2; break; + default: + llvm_unreachable("Invalid opcode for post-index load/store instructions"); + } + + // Decode vector list of 1/2/3/4 vectors for load instructions. + if (IsLoad) { + switch (NumVecs) { + case 1: + Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder) + : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder); + break; + } + } + + // Decode write back register, which is equal to Rn. + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte + Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8))); + else // Decode Rm + DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + + // Decode vector list of 1/2/3/4 vectors for load instructions. + if (!IsLoad) { + switch (NumVecs) { + case 1: + Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder) + : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder); + break; + } + } + + return MCDisassembler::Success; +} + +// Decode post-index vector load/store lane instructions. 
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last +// operand is an immediate equal the the length of the changed bytes, +// or Rm is decoded to a GPR64noxzr register. +static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + bool Is64bitVec = false; + bool IsLoadDup = false; + bool IsLoad = false; + // The total number of bytes transferred. + // TransferBytes = NumVecs * OneLaneBytes + unsigned TransferBytes = 0; + unsigned NumVecs = 0; + unsigned Opc = Inst.getOpcode(); + switch (Opc) { + case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: + case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: + case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: + case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: { + switch (Opc) { + case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: + TransferBytes = 1; break; + case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: + TransferBytes = 2; break; + case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: + TransferBytes = 4; break; + case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: + TransferBytes = 8; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 1; + break; + } + + case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: + case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: + case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: + case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: { + switch (Opc) { + case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: + TransferBytes = 1; break; + case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: + TransferBytes = 2; break; + case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: + TransferBytes = 4; break; + case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: + TransferBytes = 8; break; + } + IsLoadDup = true; + NumVecs = 1; + break; + } + + case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: + case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: + case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: + case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: { + switch (Opc) { + case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: + TransferBytes = 2; break; + case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: + TransferBytes = 4; break; + case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: + TransferBytes = 8; break; + case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: + TransferBytes = 16; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 2; + break; + } + + case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: + case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: + case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: + case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: { + switch (Opc) { + case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: + TransferBytes = 2; break; + case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: + TransferBytes = 4; break; + case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: + TransferBytes = 8; break; + case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: + TransferBytes = 16; break; + } + IsLoadDup = 
true; + NumVecs = 2; + break; + } + + case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: + case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: + case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: + case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: { + switch (Opc) { + case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: + TransferBytes = 3; break; + case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: + TransferBytes = 6; break; + case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: + TransferBytes = 12; break; + case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: + TransferBytes = 24; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 3; + break; + } + + case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register: + case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register: + case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register: + case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: { + switch (Opc) { + case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register: + TransferBytes = 3; break; + case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register: + TransferBytes = 6; break; + case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register: + TransferBytes = 12; break; + case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: + TransferBytes = 24; break; + } + IsLoadDup = true; + NumVecs = 3; + break; + } + + case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: + case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: + case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: + case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: { + switch (Opc) { + case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: + TransferBytes = 4; break; + case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: + TransferBytes = 8; break; + case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: + TransferBytes = 16; break; + case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: + TransferBytes = 32; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 4; + break; + } + + case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: + case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register: + case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register: + case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: { + switch (Opc) { + case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: + TransferBytes = 4; break; + case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register: + TransferBytes = 8; break; + case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register: + TransferBytes = 16; break; + case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: + TransferBytes = 32; break; + } + IsLoadDup = true; + NumVecs = 4; + break; + } + + case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: + case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: + case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: + case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: { + switch (Opc) { + case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: + TransferBytes = 1; break; + case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: + TransferBytes = 2; break; + case 
AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: + TransferBytes = 4; break; + case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: + TransferBytes = 8; break; + } + IsLoad = true; + NumVecs = 1; + break; + } + + case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: + case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: + case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: + case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: { + switch (Opc) { + case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: + TransferBytes = 2; break; + case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: + TransferBytes = 4; break; + case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: + TransferBytes = 8; break; + case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: + TransferBytes = 16; break; + } + IsLoad = true; + NumVecs = 2; + break; + } + + case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: + case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: + case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: + case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: { + switch (Opc) { + case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: + TransferBytes = 3; break; + case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: + TransferBytes = 6; break; + case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: + TransferBytes = 12; break; + case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: + TransferBytes = 24; break; + } + IsLoad = true; + NumVecs = 3; + break; + } + + case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: + case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: + case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: + case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: { + switch (Opc) { + case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: + TransferBytes = 4; break; + case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: + TransferBytes = 8; break; + case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: + TransferBytes = 16; break; + case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: + TransferBytes = 32; break; + } + IsLoad = true; + NumVecs = 4; + break; + } + + case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: + case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: + case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: + case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: { + switch (Opc) { + case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: + TransferBytes = 1; break; + case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: + TransferBytes = 2; break; + case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: + TransferBytes = 4; break; + case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: + TransferBytes = 8; break; + } + NumVecs = 1; + break; + } + + case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: + case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: + case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: + case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: { + switch (Opc) { + case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: + TransferBytes = 2; break; 
+ case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: + TransferBytes = 4; break; + case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: + TransferBytes = 8; break; + case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: + TransferBytes = 16; break; + } + NumVecs = 2; + break; + } + + case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: + case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: + case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: + case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: { + switch (Opc) { + case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: + TransferBytes = 3; break; + case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: + TransferBytes = 6; break; + case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: + TransferBytes = 12; break; + case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: + TransferBytes = 24; break; + } + NumVecs = 3; + break; + } + + case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: + case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: + case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: + case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: { + switch (Opc) { + case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: + TransferBytes = 4; break; + case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: + TransferBytes = 8; break; + case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: + TransferBytes = 16; break; + case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: + TransferBytes = 32; break; + } + NumVecs = 4; + break; + } + + default: + return MCDisassembler::Fail; + } // End of switch (Opc) + + unsigned Rt = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned Rm = fieldFromInstruction(Insn, 16, 5); + + // Decode post-index of load duplicate lane + if (IsLoadDup) { + switch (NumVecs) { + case 1: + Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder) + : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder) + : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder) + : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder) + : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); + } + + // Decode write back register, which is equal to Rn. + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes + Inst.addOperand(MCOperand::CreateImm(TransferBytes)); + else // Decode Rm + DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + + return MCDisassembler::Success; + } + + // Decode post-index of load/store lane + // Loads have a vector list as output. 
+ if (IsLoad) { + switch (NumVecs) { + case 1: + DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); + } + } + + // Decode write back register, which is equal to Rn. + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes + Inst.addOperand(MCOperand::CreateImm(TransferBytes)); + else // Decode Rm + DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + + // Decode the source vector list. + switch (NumVecs) { + case 1: + DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); + } + + // Decode lane + unsigned Q = fieldFromInstruction(Insn, 30, 1); + unsigned S = fieldFromInstruction(Insn, 10, 3); + unsigned lane = 0; + // Calculate the number of lanes by number of vectors and transfered bytes. + // NumLanes = 16 bytes / bytes of each lane + unsigned NumLanes = 16 / (TransferBytes / NumVecs); + switch (NumLanes) { + case 16: // A vector has 16 lanes, each lane is 1 bytes. + lane = (Q << 3) | S; + break; + case 8: + lane = (Q << 2) | (S >> 1); + break; + case 4: + lane = (Q << 1) | (S >> 2); + break; + case 2: + lane = Q; + break; + } + Inst.addOperand(MCOperand::CreateImm(lane)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned size = fieldFromInstruction(Insn, 22, 2); + unsigned Q = fieldFromInstruction(Insn, 30, 1); + + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); + + if(Q) + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + else + DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder); + + switch (size) { + case 0: + Inst.addOperand(MCOperand::CreateImm(8)); + break; + case 1: + Inst.addOperand(MCOperand::CreateImm(16)); + break; + case 2: + Inst.addOperand(MCOperand::CreateImm(32)); + break; + default : + return MCDisassembler::Fail; + } + return MCDisassembler::Success; +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 82ce80c..0438de3 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -368,6 +368,14 @@ AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, O << "#" << (Imm * MemScale); } +void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNo).getReg(); + std::string Name = getRegisterName(Reg); + Name[0] = 'v'; + O << Name; +} + void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -406,3 +414,126 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, printAnnotation(O, Annot); } + +template <A64SE::ShiftExtSpecifiers Ext, bool isHalf> +void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI, + 
unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + + assert(MO.isImm() && + "Immediate operand required for Neon vector immediate inst."); + + bool IsLSL = false; + if (Ext == A64SE::LSL) + IsLSL = true; + else if (Ext != A64SE::MSL) + llvm_unreachable("Invalid shift specifier in movi instruction"); + + int64_t Imm = MO.getImm(); + + // MSL and LSLH accepts encoded shift amount 0 or 1. + if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1) + llvm_unreachable("Invalid shift amount in movi instruction"); + + // LSH accepts encoded shift amount 0, 1, 2 or 3. + if (IsLSL && (Imm < 0 || Imm > 3)) + llvm_unreachable("Invalid shift amount in movi instruction"); + + // Print shift amount as multiple of 8 with MSL encoded shift amount + // 0 and 1 printed as 8 and 16. + if (!IsLSL) + Imm++; + Imm *= 8; + + // LSL #0 is not printed + if (IsLSL) { + if (Imm == 0) + return; + O << ", lsl"; + } else + O << ", msl"; + + O << " #" << Imm; +} + +void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &o) { + o << "#0x0"; +} + +void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm = MI->getOperand(OpNum); + + assert(MOUImm.isImm() && + "Immediate operand required for Neon vector immediate inst."); + + unsigned Imm = MOUImm.getImm(); + + O << "#0x"; + O.write_hex(Imm); +} + +void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm = MI->getOperand(OpNum); + + assert(MOUImm.isImm() + && "Immediate operand required for Neon vector immediate inst."); + + unsigned Imm = MOUImm.getImm(); + O << Imm; +} + +void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm8 = MI->getOperand(OpNum); + + assert(MOUImm8.isImm() && + "Immediate operand required for Neon vector immediate bytemask inst."); + + uint32_t UImm8 = MOUImm8.getImm(); + uint64_t Mask = 0; + + // Replicates 0x00 or 0xff byte in a 64-bit vector + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((UImm8 >> ByteNum) & 1) + Mask |= (uint64_t)0xff << (8 * ByteNum); + } + + O << "#0x"; + O.write_hex(Mask); +} + +// If Count > 1, there are two valid kinds of vector list: +// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} +// (2) {Vn.layout - Vm.layout} +// We choose the first kind as output. +template <A64Layout::VectorLayout Layout, unsigned Count> +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors"); + + unsigned Reg = MI->getOperand(OpNum).getReg(); + std::string LayoutStr = A64VectorLayoutToString(Layout); + O << "{"; + if (Count > 1) { // Print sub registers separately + bool IsVec64 = (Layout < A64Layout::VL_16B); + unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0; + for (unsigned I = 0; I < Count; I++) { + std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++)); + Name[0] = 'v'; + O << Name << LayoutStr; + if (I != Count - 1) + O << ", "; + } + } else { // Print the register directly when NumVecs is 1. 
+ std::string Name = getRegisterName(Reg); + Name[0] = 'v'; + O << Name << LayoutStr; + } + O << "}"; +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 639fa86..37b7273 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -157,6 +157,7 @@ public: void printRegExtendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O, A64SE::ShiftExtSpecifiers Ext); + void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); @@ -164,9 +165,18 @@ public: return RegNo == AArch64::XSP || RegNo == AArch64::WSP; } - + template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf> + void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + template <A64Layout::VectorLayout Layout, unsigned Count> + void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); }; - } #endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a3373b1..8a9077c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -578,8 +578,8 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { } MCAsmBackend * -llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { +llvm::createAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { Triple TheTriple(TT); - return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS()); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 3b811df..a64c463 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -55,11 +55,10 @@ namespace { /// by MachO. Beware! class AArch64ELFStreamer : public MCELFStreamer { public: - AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), - MappingSymbolCounter(0), LastEMS(EMS_None) { - } + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, 0, TAB, OS, Emitter), MappingSymbolCounter(0), + LastEMS(EMS_None) {} ~AArch64ELFStreamer() {} @@ -85,18 +84,17 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - virtual void EmitBytes(StringRef Data, unsigned AddrSpace) { + virtual void EmitBytes(StringRef Data) { EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data, AddrSpace); + MCELFStreamer::EmitBytes(Data); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. 
- virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - unsigned AddrSpace) { + virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace); + MCELFStreamer::EmitValueImpl(Value, Size); } private: @@ -130,7 +128,7 @@ private: MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); + AssignSection(Symbol, getCurrentSection().first); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 8ec8cbf..add874c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -31,11 +31,12 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() { UseDataRegionDirectives = true; - WeakRefDirective = "\t.weak\t"; - HasLEB128 = true; SupportsDebugInformation = true; // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; } + +// Pin the vtable to this file. +void AArch64ELFMCAsmInfo::anchor() {} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index a20bc47..d1dd285 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -14,13 +14,15 @@ #ifndef LLVM_AARCH64TARGETASMINFO_H #define LLVM_AARCH64TARGETASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - struct AArch64ELFMCAsmInfo : public MCAsmInfo { - explicit AArch64ELFMCAsmInfo(); - }; +struct AArch64ELFMCAsmInfo : public MCAsmInfoELF { + explicit AArch64ELFMCAsmInfo(); +private: + virtual void anchor(); +}; } // namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index a5c591e..b41c566 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -59,6 +59,23 @@ public: unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRightImm8(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRightImm16(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRightImm32(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRightImm64(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; // Labels are handled mostly the same way: a symbol is needed, and // just gets some fixup attached. 
@@ -152,10 +169,10 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, switch (Expr->getKind()) { default: llvm_unreachable("Unexpected operand modifier"); case AArch64MCExpr::VK_AARCH64_LO12: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, - AArch64::fixup_a64_ldst16_lo12, - AArch64::fixup_a64_ldst32_lo12, - AArch64::fixup_a64_ldst64_lo12, + static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, + AArch64::fixup_a64_ldst16_lo12, + AArch64::fixup_a64_ldst32_lo12, + AArch64::fixup_a64_ldst64_lo12, AArch64::fixup_a64_ldst128_lo12 }; assert(MemSize <= 16 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; @@ -166,19 +183,23 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc; break; case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12, - AArch64::fixup_a64_ldst16_dtprel_lo12, - AArch64::fixup_a64_ldst32_dtprel_lo12, - AArch64::fixup_a64_ldst64_dtprel_lo12 }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_dtprel_lo12, + AArch64::fixup_a64_ldst16_dtprel_lo12, + AArch64::fixup_a64_ldst32_dtprel_lo12, + AArch64::fixup_a64_ldst64_dtprel_lo12 + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; } case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12_nc, - AArch64::fixup_a64_ldst16_dtprel_lo12_nc, - AArch64::fixup_a64_ldst32_dtprel_lo12_nc, - AArch64::fixup_a64_ldst64_dtprel_lo12_nc }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_dtprel_lo12_nc, + AArch64::fixup_a64_ldst16_dtprel_lo12_nc, + AArch64::fixup_a64_ldst32_dtprel_lo12_nc, + AArch64::fixup_a64_ldst64_dtprel_lo12_nc + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; @@ -188,19 +209,23 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc; break; case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{ - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12, - AArch64::fixup_a64_ldst16_tprel_lo12, - AArch64::fixup_a64_ldst32_tprel_lo12, - AArch64::fixup_a64_ldst64_tprel_lo12 }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_tprel_lo12, + AArch64::fixup_a64_ldst16_tprel_lo12, + AArch64::fixup_a64_ldst32_tprel_lo12, + AArch64::fixup_a64_ldst64_tprel_lo12 + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; } case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12_nc, - AArch64::fixup_a64_ldst16_tprel_lo12_nc, - AArch64::fixup_a64_ldst32_tprel_lo12_nc, - AArch64::fixup_a64_ldst64_tprel_lo12_nc }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_tprel_lo12_nc, + AArch64::fixup_a64_ldst16_tprel_lo12_nc, + AArch64::fixup_a64_ldst32_tprel_lo12_nc, + AArch64::fixup_a64_ldst64_tprel_lo12_nc + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; @@ -302,6 +327,45 @@ AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6; } +unsigned AArch64MCCodeEmitter::getShiftRightImm8( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return 8 - MI.getOperand(Op).getImm(); +} + 
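A standalone sketch (not part of the patch) of the shift-right immediate mapping shared by getShiftRightImm8/16/32/64 here and the DecodeShiftRightImm* hooks in the disassembler: a shift amount in [1, size] is stored in the instruction as (size - shift), and decoding applies the same subtraction, so the transform is its own inverse.

#include <cassert>

static unsigned encodeShiftRight(unsigned Size, unsigned Shift) {
  assert(Shift >= 1 && Shift <= Size && "shift amount out of range");
  return Size - Shift;          // what getShiftRightImm<Size> computes
}

static unsigned decodeShiftRight(unsigned Size, unsigned Encoded) {
  return Size - Encoded;        // what DecodeShiftRightImm<Size> computes
}

int main() {
  const unsigned Sizes[] = {8, 16, 32, 64};
  for (unsigned Size : Sizes)
    for (unsigned Shift = 1; Shift <= Size; ++Shift)
      assert(decodeShiftRight(Size, encodeShiftRight(Size, Shift)) == Shift);
  return 0;
}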
+unsigned AArch64MCCodeEmitter::getShiftRightImm16( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return 16 - MI.getOperand(Op).getImm(); +} + +unsigned AArch64MCCodeEmitter::getShiftRightImm32( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return 32 - MI.getOperand(Op).getImm(); +} + +unsigned AArch64MCCodeEmitter::getShiftRightImm64( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm8( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return MI.getOperand(Op).getImm() - 8; +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm16( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return MI.getOperand(Op).getImm() - 16; +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm32( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return MI.getOperand(Op).getImm() - 32; +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm64( + const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { + return MI.getOperand(Op).getImm() - 64; +} template<AArch64::Fixups fixupDesired> unsigned AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI, @@ -346,7 +410,7 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups) const { if (MO.isReg()) { - return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()); + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); } else if (MO.isImm()) { return static_cast<unsigned>(MO.getImm()); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 819eead..58fc95c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -40,7 +40,7 @@ MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); - InitAArch64MCSubtargetInfo(X, TT, CPU, ""); + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); return X; } @@ -57,13 +57,14 @@ static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { return X; } -static MCAsmInfo *createAArch64MCAsmInfo(const Target &T, StringRef TT) { +static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, + StringRef TT) { Triple TheTriple(TT); MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(); - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::XSP, 0); - MAI->addInitialFrameState(0, Dst, Src); + unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0); + MAI->addInitialFrameState(Inst); return MAI; } @@ -135,17 +136,17 @@ public: return MCInstrAnalysis::isConditionalBranch(Inst); } - uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr, - uint64_t Size) const { + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size, uint64_t &Target) const { unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0; // FIXME: We only handle PCRel branches for now. 
     if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
         != MCOI::OPERAND_PCREL)
-      return -1ULL;
+      return false;
     int64_t Imm = Inst.getOperand(LblOperand).getImm();
-
-    return Addr + Imm;
+    Target = Addr + Imm;
+    return true;
   }
 };
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 3849fe3..670e657 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -43,8 +43,9 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
 
 MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
 
-MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT,
-                                      StringRef CPU);
+MCAsmBackend *createAArch64AsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      StringRef TT, StringRef CPU);
 
 } // End llvm namespace
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index fc706a4..377b533 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -20,5 +20,5 @@ Target llvm::TheAArch64Target;
 
 extern "C" void LLVMInitializeAArch64TargetInfo() {
   RegisterTarget<Triple::aarch64, /*HasJIT=*/true>
-    X(TheAArch64Target, "aarch64", "AArch64");
+    X(TheAArch64Target, "aarch64", "AArch64 (ARM 64-bit target)");
 }
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index bedccb5..2a97cd6 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -972,7 +972,7 @@ bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
   // Now we have to work out the amount of rotation needed. The first part of
   // this calculation is actually independent of RepeatWidth, but the complex
   // case will depend on it.
-  Rotation = CountTrailingZeros_64(Imm);
+  Rotation = countTrailingZeros(Imm);
   if (Rotation == 0) {
     // There were no leading zeros, which means it's either in place or there
     // are 1s at each end (e.g. 0x8003 needs rotating).
@@ -1105,3 +1105,69 @@ bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
   return isMOVNImm(RegWidth, Value, UImm16, Shift);
 }
+
+// decodeNeonModShiftImm - Decode a Neon OpCmode value into the shift amount
+// and the shift type (shift zeros or ones in), and return whether the
+// OpCmode value implies a shift operation.
+bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
+                                    unsigned &ShiftOnesIn) {
+  ShiftImm = 0;
+  ShiftOnesIn = false;
+  bool HasShift = true;
+
+  if (OpCmode == 0xe) {
+    // movi byte
+    HasShift = false;
+  } else if (OpCmode == 0x1e) {
+    // movi 64-bit bytemask
+    HasShift = false;
+  } else if ((OpCmode & 0xc) == 0x8) {
+    // shift zeros, per halfword
+    ShiftImm = ((OpCmode & 0x2) >> 1);
+  } else if ((OpCmode & 0x8) == 0) {
+    // shift zeros, per word
+    ShiftImm = ((OpCmode & 0x6) >> 1);
+  } else if ((OpCmode & 0xe) == 0xc) {
+    // shift ones, per word
+    ShiftOnesIn = true;
+    ShiftImm = (OpCmode & 0x1);
+  } else {
+    // per byte, per bytemask
+    llvm_unreachable("Unsupported Neon modified immediate");
+  }
+
+  return HasShift;
+}
+
+// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values
+// into the element value and the element size in bits.
+uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode,
+                                   unsigned &EltBits) {
+  uint64_t DecodedVal = Val;
+  EltBits = 0;
+
+  if (OpCmode == 0xe) {
+    // movi byte
+    EltBits = 8;
+  } else if (OpCmode == 0x1e) {
+    // movi 64-bit bytemask
+    DecodedVal = 0;
+    for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if ((Val >> ByteNum) & 1)
+        DecodedVal |= (uint64_t)0xff << (8 * ByteNum);
+    }
+    EltBits = 64;
+  } else if ((OpCmode & 0xc) == 0x8) {
+    // shift zeros, per halfword
+    EltBits = 16;
+  } else if ((OpCmode & 0x8) == 0) {
+    // shift zeros, per word
+    EltBits = 32;
+  } else if ((OpCmode & 0xe) == 0xc) {
+    // shift ones, per word
+    EltBits = 32;
+  } else {
+    llvm_unreachable("Unsupported Neon modified immediate");
+  }
+  return DecodedVal;
+}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9a1ca61..ce970b0 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -289,6 +289,7 @@ namespace A64SE {
     enum ShiftExtSpecifiers {
         Invalid = -1,
         LSL,
+        MSL,
         LSR,
         ASR,
         ROR,
@@ -305,6 +306,65 @@ namespace A64SE {
     };
 }
 
+namespace A64Layout {
+    enum VectorLayout {
+        Invalid = -1,
+        VL_8B,
+        VL_4H,
+        VL_2S,
+        VL_1D,
+
+        VL_16B,
+        VL_8H,
+        VL_4S,
+        VL_2D,
+
+        // Bare layout for the 128-bit vector
+        // (only show ".b", ".h", ".s", ".d" without vector number)
+        VL_B,
+        VL_H,
+        VL_S,
+        VL_D
+    };
+}
+
+inline static const char *
+A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
+  switch (Layout) {
+  case A64Layout::VL_8B: return ".8b";
+  case A64Layout::VL_4H: return ".4h";
+  case A64Layout::VL_2S: return ".2s";
+  case A64Layout::VL_1D: return ".1d";
+  case A64Layout::VL_16B: return ".16b";
+  case A64Layout::VL_8H: return ".8h";
+  case A64Layout::VL_4S: return ".4s";
+  case A64Layout::VL_2D: return ".2d";
+  case A64Layout::VL_B: return ".b";
+  case A64Layout::VL_H: return ".h";
+  case A64Layout::VL_S: return ".s";
+  case A64Layout::VL_D: return ".d";
+  default: llvm_unreachable("Unknown Vector Layout");
+  }
+}
+
+inline static A64Layout::VectorLayout
+A64StringToVectorLayout(StringRef LayoutStr) {
+  return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
+             .Case(".8b", A64Layout::VL_8B)
+             .Case(".4h", A64Layout::VL_4H)
+             .Case(".2s", A64Layout::VL_2S)
+             .Case(".1d", A64Layout::VL_1D)
+             .Case(".16b", A64Layout::VL_16B)
+             .Case(".8h", A64Layout::VL_8H)
+             .Case(".4s", A64Layout::VL_4S)
+             .Case(".2d", A64Layout::VL_2D)
+             .Case(".b", A64Layout::VL_B)
+             .Case(".h", A64Layout::VL_H)
+             .Case(".s", A64Layout::VL_S)
+             .Case(".d", A64Layout::VL_D)
+             .Default(A64Layout::Invalid);
+}
+
 namespace A64SysReg {
   enum SysRegROValues {
     MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000
@@ -1068,7 +1128,10 @@ namespace A64Imms {
   // MOVN but *not* with a MOVZ (because that would take priority).
   bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
 
-}
+  uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits);
+  bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
+                             unsigned &ShiftOnesIn);
+ }
 
 } // end namespace llvm;
diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt
index 2c28348..2348e44 100644
--- a/lib/Target/AArch64/Utils/CMakeLists.txt
+++ b/lib/Target/AArch64/Utils/CMakeLists.txt
@@ -3,3 +3,5 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMAArch64Utils
   AArch64BaseInfo.cpp
   )
+
+add_dependencies(LLVMAArch64Utils AArch64CommonTableGen)
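
A quick illustration of the NEON modified-immediate helpers imported above: for the "movi 64-bit bytemask" form (OpCmode 0x1e), A64Imms::decodeNeonModImm expands each set bit of the 8-bit immediate into a full byte of ones. The standalone C++ sketch below mirrors only that expansion loop so it can be checked in isolation; the helper name expandByteMask and the sample value 0xa5 are illustrative and are not part of the imported sources.

// Standalone sketch (not part of the vendor import): mirrors the bytemask
// expansion that A64Imms::decodeNeonModImm performs when OpCmode == 0x1e.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Each set bit N of the 8-bit immediate Val becomes byte N of all-ones in the
// 64-bit result. Other OpCmode forms are omitted here.
static uint64_t expandByteMask(unsigned Val) {
  uint64_t Decoded = 0;
  for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum)
    if ((Val >> ByteNum) & 1)
      Decoded |= (uint64_t)0xff << (8 * ByteNum);
  return Decoded;
}

int main() {
  // 0xa5 = 0b10100101: bytes 0, 2, 5 and 7 become 0xff.
  uint64_t Expanded = expandByteMask(0xa5);
  assert(Expanded == 0xff00ff0000ff00ffULL);
  std::printf("0x%016llx\n", (unsigned long long)Expanded);
  return 0;
}

Built with any C++11 compiler, this prints 0xff00ff0000ff00ff, the same value the imported helper would return for Val = 0xa5 with OpCmode = 0x1e (and EltBits set to 64).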