diff options
Diffstat (limited to 'lib/Target/ARM')
-rw-r--r-- | lib/Target/ARM/ARM.td | 21 | ||||
-rw-r--r-- | lib/Target/ARM/ARMBaseInstrInfo.cpp | 31 | ||||
-rw-r--r-- | lib/Target/ARM/ARMISelDAGToDAG.cpp | 96 | ||||
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 3 | ||||
-rw-r--r-- | lib/Target/ARM/ARMInstrFormats.td | 195 | ||||
-rw-r--r-- | lib/Target/ARM/ARMInstrInfo.td | 2 | ||||
-rw-r--r-- | lib/Target/ARM/ARMInstrNEON.td | 1169 | ||||
-rw-r--r-- | lib/Target/ARM/ARMInstrVFP.td | 8 | ||||
-rw-r--r-- | lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 6 | ||||
-rw-r--r-- | lib/Target/ARM/ARMSubtarget.cpp | 15 | ||||
-rw-r--r-- | lib/Target/ARM/ARMSubtarget.h | 5 | ||||
-rw-r--r-- | lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 11 | ||||
-rw-r--r-- | lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp | 4 | ||||
-rw-r--r-- | lib/Target/ARM/NEONPreAllocPass.cpp | 18 | ||||
-rw-r--r-- | lib/Target/ARM/README.txt | 3 | ||||
-rw-r--r-- | lib/Target/ARM/Thumb1InstrInfo.cpp | 2 | ||||
-rw-r--r-- | lib/Target/ARM/Thumb2InstrInfo.cpp | 4 |
17 files changed, 869 insertions, 724 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index bbb1dbd..6486a60 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -43,6 +43,21 @@ def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2", def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; +// Some processors have multiply-accumulate instructions that don't +// play nicely with other VFP instructions, and it's generally better +// to just not use them. +// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for +// others as well. We should do more benchmarking and confirm one way or +// the other. +def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", + "Disable VFP MAC instructions">; +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + + //===----------------------------------------------------------------------===// // ARM Processors supported. // @@ -92,7 +107,8 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowVMLx]>; def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; @@ -106,7 +122,8 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, - [ArchV7A, FeatureThumb2, FeatureNEON]>; + [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, + FeatureNEONForFP]>; def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e6ea03a..0a0b0ea 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -204,7 +204,15 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -275,6 +283,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isUncondBranchOpcode(I->getOpcode()) && !isCondBranchOpcode(I->getOpcode())) return 0; @@ -738,14 +751,16 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!"); // FIXME: Neon instructions should support predicates if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO) .addReg(SrcReg, getKillRegState(isKill))); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRQ)). + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)). addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } @@ -788,12 +803,14 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_8RegisterClass) && "Unknown regclass!"); if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO)); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 71207c8..7d48663 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -124,17 +124,17 @@ private: /// SelectDYN_ALLOC - Select dynamic alloc for Thumb. SDNode *SelectDYN_ALLOC(SDNode *N); - /// SelectVLD - Select NEON load intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); @@ -1022,7 +1022,7 @@ static EVT GetNEONSubregVT(EVT VT) { SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1047,6 +1047,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1060,15 +1063,15 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VLD2, - // loading 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VLD1 and VLD2, + // loading pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector<EVT> ResTys(4, VT); + std::vector<EVT> ResTys(2 * NumVecs, RegVT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - Chain = SDValue(VLd, 4); + Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { @@ -1109,7 +1112,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1134,6 +1137,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1154,9 +1160,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VST2, - // storing 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VST1 and VST2, + // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, @@ -1167,7 +1173,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 9); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), + 5 + 2 * NumVecs); } // Otherwise, quad registers are stored with two separate instructions, @@ -1694,6 +1701,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ResNode = SelectARMIndexedLoad(N); if (ResNode) return ResNode; + + // VLDMQ must be custom-selected for "v2f64 load" to set the AM5Opc value. + if (Subtarget->hasVFP2() && + N->getValueType(0).getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VLDMQ, dl, MVT::v2f64, MVT::Other, + Ops, 5); + } + // Other cases are autogenerated. + break; + } + case ISD::STORE: { + // VSTMQ must be custom-selected for "v2f64 store" to set the AM5Opc value. + if (Subtarget->hasVFP2() && + N->getOperand(1).getValueType().getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), N->getOperand(2), + AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VSTMQ, dl, MVT::Other, Ops, 6); + } // Other cases are autogenerated. break; } @@ -1831,16 +1867,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vld2: { unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD2d64 }; + ARM::VLD2d32, ARM::VLD1q64 }; unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, - ARM::VLD3d32, ARM::VLD3d64 }; + ARM::VLD3d32, ARM::VLD1d64T }; unsigned QOpcodes0[] = { ARM::VLD3q8_UPD, ARM::VLD3q16_UPD, ARM::VLD3q32_UPD }; @@ -1852,7 +1896,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vld4: { unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, - ARM::VLD4d32, ARM::VLD4d64 }; + ARM::VLD4d32, ARM::VLD1d64Q }; unsigned QOpcodes0[] = { ARM::VLD4q8_UPD, ARM::VLD4q16_UPD, ARM::VLD4q32_UPD }; @@ -1883,16 +1927,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } + case Intrinsic::arm_neon_vst1: { + unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vst2: { unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST2d64 }; + ARM::VST2d32, ARM::VST1q64 }; unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 }; return SelectVST(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16, - ARM::VST3d32, ARM::VST3d64 }; + ARM::VST3d32, ARM::VST1d64T }; unsigned QOpcodes0[] = { ARM::VST3q8_UPD, ARM::VST3q16_UPD, ARM::VST3q32_UPD }; @@ -1904,7 +1956,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vst4: { unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, - ARM::VST4d32, ARM::VST4d64 }; + ARM::VST4d32, ARM::VST1d64Q }; unsigned QOpcodes0[] = { ARM::VST4q8_UPD, ARM::VST4q16_UPD, ARM::VST4q32_UPD }; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0d0a004..b6c81f6 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -456,6 +456,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // Generic (and overly aggressive) if-conversion limits. setIfCvtBlockSizeLimit(10); setIfCvtDupBlockSizeLimit(2); + } else if (Subtarget->hasV7Ops()) { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(1); } else if (Subtarget->hasV6Ops()) { setIfCvtBlockSizeLimit(2); setIfCvtDupBlockSizeLimit(1); diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 4f6f05d..4427e50 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -1,10 +1,10 @@ //===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -59,7 +59,18 @@ def NEONDupFrm : Format<28>; def MiscFrm : Format<29>; def ThumbMiscFrm : Format<30>; -def NLdStFrm : Format<31>; +def NLdStFrm : Format<31>; +def N1RegModImmFrm : Format<32>; +def N2RegFrm : Format<33>; +def NVCVTFrm : Format<34>; +def NVDupLnFrm : Format<35>; +def N2RegVShLFrm : Format<36>; +def N2RegVShRFrm : Format<37>; +def N3RegFrm : Format<38>; +def N3RegVShFrm : Format<39>; +def NVExtFrm : Format<40>; +def NVMulSLFrm : Format<41>; +def NVTBLFrm : Format<42>; // Misc flags. @@ -177,13 +188,13 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, // TSFlagsFields AddrMode AM = am; bits<4> AddrModeBits = AM.Value; - + SizeFlagVal SZ = sz; bits<3> SizeFlag = SZ.Value; IndexMode IM = im; bits<2> IndexModeBits = IM.Value; - + Format F = f; bits<6> Form = F.Value; @@ -195,7 +206,7 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, // bit isUnaryDataProc = 0; bit canXformTo16Bit = 0; - + let Constraints = cstr; let Itinerary = itin; } @@ -214,9 +225,9 @@ class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im, Format f, Domain d, string cstr, InstrItinClass itin> : InstTemplate<am, sz, im, f, d, cstr, itin>; -class PseudoInst<dag oops, dag iops, InstrItinClass itin, +class PseudoInst<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, + : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, "", itin> { let OutOperandList = oops; let InOperandList = iops; @@ -226,7 +237,7 @@ class PseudoInst<dag oops, dag iops, InstrItinClass itin, // Almost all ARM instructions are predicable. class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, InstrItinClass itin, + IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { @@ -238,9 +249,9 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, } // A few are not predicable class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, InstrItinClass itin, - string opc, string asm, string cstr, - list<dag> pattern> + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; @@ -290,9 +301,9 @@ class AXI<dag oops, dag iops, Format f, InstrItinClass itin, : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern>; class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> + string opc, string asm, list<dag> pattern> : InoP<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern>; + opc, asm, "", pattern>; // Ctrl flow instructions class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, @@ -362,7 +373,7 @@ class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, +class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin, opc, asm, "", pattern>; @@ -387,7 +398,7 @@ class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, +class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { @@ -407,7 +418,7 @@ class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, +class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { @@ -549,7 +560,7 @@ class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, } // addrmode3 instructions -class AI3<dag oops, dag iops, Format f, InstrItinClass itin, +class AI3<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern>; @@ -853,7 +864,6 @@ class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{27-25} = 0b000; } - // addrmode4 instructions class AXI4ld<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, string asm, string cstr, list<dag> pattern> @@ -961,20 +971,25 @@ class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; // Two-address instructions -class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>; +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", + pattern>; // tBL, tBX 32-bit instructions class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, - dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, Encoding { + dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, + Encoding { let Inst{31-27} = opcod1; let Inst{15-14} = opcod2; let Inst{12} = opcod3; } // BR_JT instructions -class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; // Thumb1 only @@ -1001,7 +1016,7 @@ class T1JTI<dag oops, dag iops, InstrItinClass itin, // Two-address instructions class T1It<dag oops, dag iops, InstrItinClass itin, string asm, string cstr, list<dag> pattern> - : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, cstr, pattern>; // Thumb1 instruction that can either be predicated or set CPSR. @@ -1024,7 +1039,7 @@ class T1sI<dag oops, dag iops, InstrItinClass itin, class T1sIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$lhs = $dst", pattern>; // Thumb1 instruction that can be predicated. class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -1046,7 +1061,7 @@ class T1pI<dag oops, dag iops, InstrItinClass itin, class T1pIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$lhs = $dst", pattern>; class T1pI1<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -1057,7 +1072,7 @@ class T1pI2<dag oops, dag iops, InstrItinClass itin, class T1pI4<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>; -class T1pIs<dag oops, dag iops, +class T1pIs<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>; @@ -1146,8 +1161,8 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, } class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - InstrItinClass itin, - string asm, string cstr, list<dag> pattern> + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; @@ -1161,7 +1176,7 @@ class T2I<dag oops, dag iops, InstrItinClass itin, : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; class T2Ii12<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>; + : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "",pattern>; class T2Ii8<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>; @@ -1196,7 +1211,7 @@ class T2JTI<dag oops, dag iops, InstrItinClass itin, : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; class T2Ix2<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> + string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>; // Two-address instructions @@ -1295,7 +1310,7 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, itin, opc, asm, "", pattern> { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1309,7 +1324,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, itin, opc, asm, "", pattern> { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1320,7 +1335,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, - VFPLdStMulFrm, itin, asm, cstr, pattern> { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1011; @@ -1332,7 +1347,7 @@ class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, class AXSI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, - VFPLdStMulFrm, itin, asm, cstr, pattern> { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1010; @@ -1353,7 +1368,8 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, // Double precision, binary class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; @@ -1362,6 +1378,20 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{4} = op4; } +// Double precision, binary, VML[AS] (for additional predicate) +class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + let Inst{6} = op6; + let Inst{4} = op4; + list<Predicate> Predicates = [HasVFP2, UseVMLx]; +} + + // Single precision, unary class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, @@ -1399,7 +1429,8 @@ class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, // Single precision binary, if no NEON // Same as ASbI except not available if NEON is enabled class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; } @@ -1419,8 +1450,8 @@ class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, // VFP conversion between floating-point and fixed-point class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> { // size (fixed-point number): sx == 0 ? 16 : 32 let Inst{7} = op5; // sx @@ -1448,7 +1479,7 @@ class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; -class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; @@ -1480,9 +1511,10 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, } // Same as NeonI except it does not have a "data type" specifier. -class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, Size4Bytes, im, NEONFrm, NeonDomain, cstr, itin> { +class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm)); @@ -1490,18 +1522,6 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, list<Predicate> Predicates = [HasNEON]; } -class NI<dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> - : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, "", - pattern> { -} - -class NI4<dag oops, dag iops, InstrItinClass itin, string opc, - string asm, list<dag> pattern> - : NeonXI<oops, iops, AddrMode4, IndexModeNone, itin, opc, asm, "", - pattern> { -} - class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> @@ -1514,17 +1534,17 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, let Inst{7-4} = op7_4; } -class NDataI<dag oops, dag iops, InstrItinClass itin, +class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NeonI<oops, iops, AddrModeNone, IndexModeNone, NEONFrm, itin, opc, dt, asm, - cstr, pattern> { + : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, + pattern> { let Inst{31-25} = 0b1111001; } -class NDataXI<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, - cstr, pattern> { +class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, + cstr, pattern> { let Inst{31-25} = 0b1111001; } @@ -1532,8 +1552,9 @@ class NDataXI<dag oops, dag iops, InstrItinClass itin, class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, bit op5, bit op4, dag oops, dag iops, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{23} = op23; let Inst{21-19} = op21_19; let Inst{11-8} = op11_8; @@ -1548,7 +1569,7 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1560,10 +1581,10 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // Same as N2V except it doesn't have a datatype suffix. class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> { + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1575,9 +1596,9 @@ class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // NEON 2 vector register with immediate. class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{11-8} = op11_8; @@ -1588,9 +1609,9 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // NEON 3 vector register format. class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1599,11 +1620,12 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{4} = op4; } -// Same as N3VX except it doesn't have a data type suffix. -class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> { +// Same as N3V except it doesn't have a data type suffix. +class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1617,7 +1639,7 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain, - "", itin> { + "", itin> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; @@ -1647,6 +1669,19 @@ class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, opc, dt, asm, pattern>; +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list<dag> pattern> + : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; +} + // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON // for single-precision FP. class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 26a2806..f2ab06f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -140,6 +140,8 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseVMLx : Predicate<"Subtarget->useVMLx()">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index c977cc3..ed9d31d 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -115,51 +115,80 @@ def h64imm : Operand<i64> { // NEON load / store instructions //===----------------------------------------------------------------------===// +let mayLoad = 1 in { // Use vldmia to load a Q register as a D register pair. -def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, - "vldmia", "$addr, ${dst:dregpair}", - [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 1; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VLDMD except that it has a Q register operand +// instead of a pair of D registers. +def VLDMQ + : AXDI5<(outs QPR:$dst), (ins addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, ${dst:dregpair}", "", []>; +def VLDMQ_UPD + : AXDI5<(outs QPR:$dst, GPR:$wb), (ins addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}!, ${dst:dregpair}", + "$addr.base = $wb", []>; + +// Use vld1 to load a Q register as a D register pair. +// This alternative to VLDMQ allows an alignment to be specified. +// This is equivalent to VLD1q64 except that it has a Q register operand. +def VLD1q + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), + IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; +def VLD1q_UPD + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", "64", + "${dst:dregpair}, $addr$offset", "$addr.addr = $wb", []>; +} // mayLoad = 1 +let mayStore = 1 in { // Use vstmia to store a Q register as a D register pair. -def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, - "vstmia", "$addr, ${src:dregpair}", - [(store (v2f64 QPR:$src), addrmode4:$addr)]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 0; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VSTMD except that it has a Q register operand +// instead of a pair of D registers. +def VSTMQ + : AXDI5<(outs), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, ${src:dregpair}", "", []>; +def VSTMQ_UPD + : AXDI5<(outs GPR:$wb), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}!, ${src:dregpair}", + "$addr.base = $wb", []>; + +// Use vst1 to store a Q register as a D register pair. +// This alternative to VSTMQ allows an alignment to be specified. +// This is equivalent to VST1q64 except that it has a Q register operand. +def VST1q + : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src), + IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>; +def VST1q_UPD + : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), + IIC_VST, "vst1", "64", "{$src:dregpair}, $addr$offset", + "$addr.addr = $wb", []>; +} // mayStore = 1 -// VLD1 : Vector Load (multiple single elements) -class VLD1D<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr", "", - [(set DPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; -class VLD1Q<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "${dst:dregpair}, $addr", "", - [(set QPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; - -def VLD1d8 : VLD1D<0b0000, "8", v8i8>; -def VLD1d16 : VLD1D<0b0100, "16", v4i16>; -def VLD1d32 : VLD1D<0b1000, "32", v2i32>; -def VLD1df : VLD1D<0b1000, "32", v2f32>; -def VLD1d64 : VLD1D<0b1100, "64", v1i64>; - -def VLD1q8 : VLD1Q<0b0000, "8", v16i8>; -def VLD1q16 : VLD1Q<0b0100, "16", v8i16>; -def VLD1q32 : VLD1Q<0b1000, "32", v4i32>; -def VLD1qf : VLD1Q<0b1000, "32", v4f32>; -def VLD1q64 : VLD1Q<0b1100, "64", v2i64>; +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { -let mayLoad = 1 in { +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr", "", []>; +class VLD1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + +def VLD1d8 : VLD1D<0b0000, "8">; +def VLD1d16 : VLD1D<0b0100, "16">; +def VLD1d32 : VLD1D<0b1000, "32">; +def VLD1d64 : VLD1D<0b1100, "64">; + +def VLD1q8 : VLD1Q<0b0000, "8">; +def VLD1q16 : VLD1Q<0b0100, "16">; +def VLD1q32 : VLD1Q<0b1000, "32">; +def VLD1q64 : VLD1Q<0b1100, "64">; // ...with address register writeback: class VLD1DWB<bits<4> op7_4, string Dt> @@ -182,54 +211,48 @@ def VLD1q8_UPD : VLD1QWB<0b0000, "8">; def VLD1q16_UPD : VLD1QWB<0b0100, "16">; def VLD1q32_UPD : VLD1QWB<0b1000, "32">; def VLD1q64_UPD : VLD1QWB<0b1100, "64">; -} // mayLoad = 1 -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { - -// These (dreg triple/quadruple) are for disassembly only. +// ...with 3 registers (some of these are only for the disassembler): class VLD1D3<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VLD1D4<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VLD1d8T : VLD1D3<0b0000, "8">; -def VLD1d16T : VLD1D3<0b0100, "16">; -def VLD1d32T : VLD1D3<0b1000, "32">; -// VLD1d64T : implemented as VLD3d64 - -def VLD1d8Q : VLD1D4<0b0000, "8">; -def VLD1d16Q : VLD1D4<0b0100, "16">; -def VLD1d32Q : VLD1D4<0b1000, "32">; -// VLD1d64Q : implemented as VLD4d64 - -// ...with address register writeback: + "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; class VLD1D3WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; + +def VLD1d8T : VLD1D3<0b0000, "8">; +def VLD1d16T : VLD1D3<0b0100, "16">; +def VLD1d32T : VLD1D3<0b1000, "32">; +def VLD1d64T : VLD1D3<0b1100, "64">; + +def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; +def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; +def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; +def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VLD1D4<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; class VLD1D4WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0010,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + []>; -def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; -def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; -def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; -// VLD1d64T_UPD : implemented as VLD3d64_UPD +def VLD1d8Q : VLD1D4<0b0000, "8">; +def VLD1d16Q : VLD1D4<0b0100, "16">; +def VLD1d32Q : VLD1D4<0b1000, "32">; +def VLD1d64Q : VLD1D4<0b1100, "64">; def VLD1d8Q_UPD : VLD1D4WB<0b0000, "8">; def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; -// VLD1d64Q_UPD : implemented as VLD4d64_UPD +def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; // VLD2 : Vector Load (multiple 2-element structures) class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -245,9 +268,6 @@ class VLD2Q<bits<4> op7_4, string Dt> def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; -def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr", "", []>; def VLD2q8 : VLD2Q<0b0000, "8">; def VLD2q16 : VLD2Q<0b0100, "16">; @@ -269,11 +289,6 @@ class VLD2QWB<bits<4> op7_4, string Dt> def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; -def VLD2d64_UPD : NLdSt<0,0b10,0b1010,0b1100, - (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VLD2q8_UPD : VLD2QWB<0b0000, "8">; def VLD2q16_UPD : VLD2QWB<0b0100, "16">; @@ -296,10 +311,6 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; def VLD3d32 : VLD3D<0b0100, 0b1000, "32">; -def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; // ...with address register writeback: class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -312,11 +323,6 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; -def VLD3d64_UPD : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; @@ -341,11 +347,6 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; -def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", - "", []>; // ...with address register writeback: class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -358,13 +359,6 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; -def VLD4d64_UPD : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, - GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; @@ -383,62 +377,62 @@ def VLD4q32odd_UPD : VLD4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. // VLD2LN : Vector Load (single 2-element structure to one lane) -class VLD2LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2), +class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2", []>; -def VLD2LNd8 : VLD2LN<0b0001, "8">; -def VLD2LNd16 : VLD2LN<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32 : VLD2LN<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD2LNq16 : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32 : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD2LNq16odd : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32odd : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD2LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), +class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset", "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>; -def VLD2LNd8_UPD : VLD2LNWB<0b0001, "8">; -def VLD2LNd16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; -def VLD2LNq16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; // VLD3LN : Vector Load (single 3-element structure to one lane) -class VLD3LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), +class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3, "vld3", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; -def VLD3LNd8 : VLD3LN<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VLD3LNq16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD3LNq16odd : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32odd : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...with address register writeback: -class VLD3LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), @@ -447,37 +441,37 @@ class VLD3LNWB<bits<4> op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", []>; -def VLD3LNd8_UPD : VLD3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; -def VLD3LNq16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; // VLD4LN : Vector Load (single 4-element structure to one lane) -class VLD4LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4, "vld4", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; -def VLD4LNd8 : VLD4LN<0b0011, "8">; -def VLD4LNd16 : VLD4LN<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32 : VLD4LN<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD4LNq16 : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32 : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD4LNq16odd : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32odd : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD4LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), @@ -486,12 +480,12 @@ class VLD4LNWB<bits<4> op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb", []>; -def VLD4LNd8_UPD : VLD4LNWB<0b0011, "8">; -def VLD4LNd16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; -def VLD4LNq16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; // VLD1DUP : Vector Load (single element to all lanes) // VLD2DUP : Vector Load (single 2-element structure to all lanes) @@ -500,31 +494,26 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } // FIXME: Not yet implemented. } // mayLoad = 1, hasExtraDefRegAllocReq = 1 +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + // VST1 : Vector Store (multiple single elements) -class VST1D<bits<4> op7_4, string Dt, ValueType Ty> +class VST1D<bits<4> op7_4, string Dt> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, - "vst1", Dt, "\\{$src\\}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty DPR:$src))]>; -class VST1Q<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, - "vst1", Dt, "${src:dregpair}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty QPR:$src))]>; - -let hasExtraSrcRegAllocReq = 1 in { -def VST1d8 : VST1D<0b0000, "8", v8i8>; -def VST1d16 : VST1D<0b0100, "16", v4i16>; -def VST1d32 : VST1D<0b1000, "32", v2i32>; -def VST1df : VST1D<0b1000, "32", v2f32>; -def VST1d64 : VST1D<0b1100, "64", v1i64>; - -def VST1q8 : VST1Q<0b0000, "8", v16i8>; -def VST1q16 : VST1Q<0b0100, "16", v8i16>; -def VST1q32 : VST1Q<0b1000, "32", v4i32>; -def VST1qf : VST1Q<0b1000, "32", v4f32>; -def VST1q64 : VST1Q<0b1100, "64", v2i64>; -} // hasExtraSrcRegAllocReq - -let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + "vst1", Dt, "\\{$src\\}, $addr", "", []>; +class VST1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b1010,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + +def VST1d8 : VST1D<0b0000, "8">; +def VST1d16 : VST1D<0b0100, "16">; +def VST1d32 : VST1D<0b1000, "32">; +def VST1d64 : VST1D<0b1100, "64">; + +def VST1q8 : VST1Q<0b0000, "8">; +def VST1q16 : VST1Q<0b0100, "16">; +def VST1q32 : VST1Q<0b1000, "32">; +def VST1q64 : VST1Q<0b1100, "64">; // ...with address register writeback: class VST1DWB<bits<4> op7_4, string Dt> @@ -546,53 +535,50 @@ def VST1q16_UPD : VST1QWB<0b0100, "16">; def VST1q32_UPD : VST1QWB<0b1000, "32">; def VST1q64_UPD : VST1QWB<0b1100, "64">; -// These (dreg triple/quadruple) are for disassembly only. +// ...with 3 registers (some of these are only for the disassembler): class VST1D3<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VST1D4<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VST1d8T : VST1D3<0b0000, "8">; -def VST1d16T : VST1D3<0b0100, "16">; -def VST1d32T : VST1D3<0b1000, "32">; -// VST1d64T : implemented as VST3d64 - -def VST1d8Q : VST1D4<0b0000, "8">; -def VST1d16Q : VST1D4<0b0100, "16">; -def VST1d32Q : VST1D4<0b1000, "32">; -// VST1d64Q : implemented as VST4d64 - -// ...with address register writeback: + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; class VST1D3WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; + +def VST1d8T : VST1D3<0b0000, "8">; +def VST1d16T : VST1D3<0b0100, "16">; +def VST1d32T : VST1D3<0b1000, "32">; +def VST1d64T : VST1D3<0b1100, "64">; + +def VST1d8T_UPD : VST1D3WB<0b0000, "8">; +def VST1d16T_UPD : VST1D3WB<0b0100, "16">; +def VST1d32T_UPD : VST1D3WB<0b1000, "32">; +def VST1d64T_UPD : VST1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VST1D4<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", + []>; class VST1D4WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; -def VST1d8T_UPD : VST1D3WB<0b0000, "8">; -def VST1d16T_UPD : VST1D3WB<0b0100, "16">; -def VST1d32T_UPD : VST1D3WB<0b1000, "32">; -// VST1d64T_UPD : implemented as VST3d64_UPD +def VST1d8Q : VST1D4<0b0000, "8">; +def VST1d16Q : VST1D4<0b0100, "16">; +def VST1d32Q : VST1D4<0b1000, "32">; +def VST1d64Q : VST1D4<0b1100, "64">; def VST1d8Q_UPD : VST1D4WB<0b0000, "8">; def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; -// VST1d64Q_UPD : implemented as VST4d64_UPD +def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; // VST2 : Vector Store (multiple 2-element structures) class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -608,9 +594,6 @@ class VST2Q<bits<4> op7_4, string Dt> def VST2d8 : VST2D<0b1000, 0b0000, "8">; def VST2d16 : VST2D<0b1000, 0b0100, "16">; def VST2d32 : VST2D<0b1000, 0b1000, "32">; -def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr", "", []>; def VST2q8 : VST2Q<0b0000, "8">; def VST2q16 : VST2Q<0b0100, "16">; @@ -632,11 +615,6 @@ class VST2QWB<bits<4> op7_4, string Dt> def VST2d8_UPD : VST2DWB<0b1000, 0b0000, "8">; def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">; def VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">; -def VST2d64_UPD : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VST2q8_UPD : VST2QWB<0b0000, "8">; def VST2q16_UPD : VST2QWB<0b0100, "16">; @@ -659,10 +637,6 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> def VST3d8 : VST3D<0b0100, 0b0000, "8">; def VST3d16 : VST3D<0b0100, 0b0100, "16">; def VST3d32 : VST3D<0b0100, 0b1000, "32">; -def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr", "", []>; // ...with address register writeback: class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -675,11 +649,6 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; -def VST3d64_UPD : NLdSt<0,0b00,0b0110,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST3q8 : VST3D<0b0101, 0b0000, "8">; @@ -704,11 +673,6 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> def VST4d8 : VST4D<0b0000, 0b0000, "8">; def VST4d16 : VST4D<0b0000, 0b0100, "16">; def VST4d32 : VST4D<0b0000, 0b1000, "32">; -def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - DPR:$src4), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3, $src4\\}, $addr", - "", []>; // ...with address register writeback: class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -721,12 +685,6 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; -def VST4d64_UPD : NLdSt<0,0b00,0b0010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, - "vst1", "64", - "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST4q8 : VST4D<0b0001, 0b0000, "8">; @@ -745,109 +703,109 @@ def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. // VST2LN : Vector Store (single 2-element structure from one lane) -class VST2LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", "", []>; -def VST2LNd8 : VST2LN<0b0001, "8">; -def VST2LNd16 : VST2LN<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32 : VST2LN<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; +def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST2LNq16 : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32 : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST2LNq16odd : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32odd : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VST2LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST2LNd8_UPD : VST2LNWB<0b0001, "8">; -def VST2LNd16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8">; +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; -def VST2LNq16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; // VST3LN : Vector Store (single 3-element structure from one lane) -class VST3LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; -def VST3LNd8 : VST3LN<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VST3LNq16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST3LNq16odd : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32odd : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, "32">; // ...with address register writeback: -class VST3LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST3LNd8_UPD : VST3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; -def VST3LNq16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; // VST4LN : Vector Store (single 4-element structure from one lane) -class VST4LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", "", []>; -def VST4LNd8 : VST4LN<0b0011, "8">; -def VST4LNd16 : VST4LN<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32 : VST4LN<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST4LNq16 : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32 : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST4LNq16odd : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32odd : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VST4LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST4LNd8_UPD : VST4LNWB<0b0011, "8">; -def VST4LNd16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; -def VST4LNq16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">; } // mayStore = 1, hasExtraSrcRegAllocReq = 1 @@ -906,18 +864,18 @@ class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", + (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt,"$dst, $src", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src", "", + (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt,"$dst, $src", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; // Basic 2-register intrinsics, both double- and quad-register. class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, + bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), @@ -966,8 +924,8 @@ class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, - OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, + IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { let isCommutable = Commutable; } @@ -975,7 +933,7 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; @@ -986,27 +944,28 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]>{ let isCommutable = Commutable; } + class N3VDSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2), imm:$lane)))))]>{ + (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> { let isCommutable = 0; } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16D, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { @@ -1017,7 +976,7 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; @@ -1026,7 +985,7 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, OpcodeStr, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{ let isCommutable = Commutable; @@ -1036,7 +995,7 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1047,7 +1006,7 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16Q, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1057,10 +1016,10 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, // Basic 3-register intrinsics, both double- and quad-register. class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), f, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; @@ -1069,7 +1028,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_VFP2:$src2), @@ -1080,19 +1039,18 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), - imm:$lane)))))]> { + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { let isCommutable = 0; } class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), f, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; @@ -1102,7 +1060,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1114,7 +1072,7 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1128,14 +1086,14 @@ class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (Ty (OpNode DPR:$src1, (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; @@ -1144,7 +1102,8 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1156,7 +1115,8 @@ class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1168,7 +1128,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (Ty (OpNode QPR:$src1, (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; @@ -1177,7 +1137,8 @@ class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1190,7 +1151,8 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1204,7 +1166,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; @@ -1212,7 +1174,7 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; @@ -1223,7 +1185,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; @@ -1232,7 +1194,8 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), @@ -1244,7 +1207,8 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), @@ -1257,7 +1221,7 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D, + (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINi4D, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { let isCommutable = Commutable; @@ -1268,7 +1232,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -1277,8 +1241,8 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1288,7 +1252,7 @@ class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1299,7 +1263,7 @@ class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -1344,17 +1308,17 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), f, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), f, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; @@ -1363,8 +1327,8 @@ class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), (i32 imm:$SIMM))))]>; @@ -1373,7 +1337,7 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, + (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), N2RegVShRFrm, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), (i32 imm:$SIMM))))]>; @@ -1383,14 +1347,14 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VPALiD, + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set DPR:$dst, (Ty (add DPR:$src1, (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VPALiD, + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set QPR:$dst, (Ty (add QPR:$src1, (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; @@ -1398,15 +1362,15 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Shift by immediate and insert, // both double- and quad-register. class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VSHLiD, + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), f, IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VSHLiQ, + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; @@ -1416,15 +1380,15 @@ class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; //===----------------------------------------------------------------------===// @@ -1568,24 +1532,24 @@ multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, // Neon 3-register vector intrinsics. // First with only element sizes of 16 and 32 bits: -multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> { // 64-bit vector types. - def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16, + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp, Commutable>; - def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32, + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp, Commutable>; // 128-bit vector types. - def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16, + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp, Commutable>; - def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32, + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp, Commutable>; } @@ -1605,38 +1569,37 @@ multiclass N3VIntSL_HS<bits<4> op11_8, } // ....then also with element size of 8 bits: -multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { - def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16, + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp, Commutable>; - def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16, + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp, Commutable>; } // ....then also with element size of 64 bits: -multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { - def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32, + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, OpcodeStr, !strconcat(Dt, "64"), v1i64, v1i64, IntOp, Commutable>; - def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32, + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32, OpcodeStr, !strconcat(Dt, "64"), v2i64, v2i64, IntOp, Commutable>; } - // Neon Narrowing 3-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1866,46 +1829,46 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode> { + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode, Format f> { // 64-bit vector types. - def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, itin, + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, f, itin, OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; // imm6 = xxxxxx // 128-bit vector types. - def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, itin, + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, f, itin, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx } - // Neon Shift-Accumulate vector operations, // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1947,41 +1910,43 @@ multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, // Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, SDNode ShOp> { + string OpcodeStr, SDNode ShOp, + Format f> { // 64-bit vector types. def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "8", v8i8, ShOp> { + f, OpcodeStr, "8", v8i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "16", v4i16, ShOp> { + f, OpcodeStr, "16", v4i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "32", v2i32, ShOp> { + f, OpcodeStr, "32", v2i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, - OpcodeStr, "64", v1i64, ShOp>; + f, OpcodeStr, "64", v1i64, ShOp>; // imm6 = xxxxxx // 128-bit vector types. def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "8", v16i8, ShOp> { + f, OpcodeStr, "8", v16i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "16", v8i16, ShOp> { + f, OpcodeStr, "16", v8i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "32", v4i32, ShOp> { + f, OpcodeStr, "32", v4i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, - OpcodeStr, "64", v2i64, ShOp>; + f, OpcodeStr, "64", v2i64, ShOp>; // imm6 = xxxxxx } @@ -2044,20 +2009,26 @@ defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl", "u", defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>; defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>; // VHADD : Vector Halving Add -defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "s", int_arm_neon_vhadds, 1>; -defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "u", int_arm_neon_vhaddu, 1>; +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; // VRHADD : Vector Rounding Halving Add -defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "s", int_arm_neon_vrhadds, 1>; -defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; // VQADD : Vector Saturating Add -defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "s", int_arm_neon_vqadds, 1>; -defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "u", int_arm_neon_vqaddu, 1>; +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", int_arm_neon_vaddhn, 1>; @@ -2070,10 +2041,10 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", // VMUL : Vector Multiply (integer, polynomial and floating-point) defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; -def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul", "p8", - v8i8, v8i8, int_arm_neon_vmulp, 1>; -def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul", "p8", - v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32", v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32", @@ -2103,7 +2074,7 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VQDMULH : Vector Saturating Doubling Multiply Returning High Half -defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D, +defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vqdmulh", "s", int_arm_neon_vqdmulh, 1>; defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, @@ -2125,8 +2096,8 @@ def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half -defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D, - IIC_VMULi16Q, IIC_VMULi32Q, +defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, + IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q, "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>; defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, @@ -2285,18 +2256,18 @@ defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl", "u", defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>; defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>; // VHSUB : Vector Halving Subtract -defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vhsub", "s", int_arm_neon_vhsubs, 0>; -defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vhsub", "u", int_arm_neon_vhsubu, 0>; // VQSUB : Vector Saturing Subtract -defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vqsub", "s", int_arm_neon_vqsubs, 0>; -defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vqsub", "u", int_arm_neon_vqsubu, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", @@ -2323,8 +2294,8 @@ defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "s", NEONvcge, 0>; defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "u", NEONvcgeu, 0>; -def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", - v2i32, v2f32, NEONvcge, 0>; +def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; // For disassembly only. @@ -2351,21 +2322,27 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", "$dst, $src, #0">; // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge", "f32", - v2i32, v2f32, int_arm_neon_vacged, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge", "f32", - v4i32, v4f32, int_arm_neon_vacgeq, 0>; +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>; // VACGT : Vector Absolute Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt", "f32", - v2i32, v2f32, int_arm_neon_vacgtd, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt", "f32", - v4i32, v4f32, int_arm_neon_vacgtq, 0>; +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; // Vector Bitwise Operations. +def vnot8 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>; +def vnot16 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + + // VAND : Vector Bitwise AND def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>; @@ -2386,74 +2363,80 @@ def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", // VBIC : Vector Bitwise Bit Clear (AND NOT) def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vbic", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (and DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vbic", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (and DPR:$src1, + (vnot8 DPR:$src2))))]>; def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vbic", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (and QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vbic", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (and QPR:$src1, + (vnot16 QPR:$src2))))]>; // VORN : Vector Bitwise OR NOT def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vorn", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (or DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vorn", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (or DPR:$src1, + (vnot8 DPR:$src2))))]>; def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vorn", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (or QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vorn", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (or QPR:$src1, + (vnot16 QPR:$src2))))]>; // VMVN : Vector Bitwise NOT def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, - (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>; + (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + [(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>; def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, - (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>; -def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>; -def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>; + (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>; +def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>; // VBSL : Vector Bitwise Select def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, - (v2i32 (or (and DPR:$src2, DPR:$src1), - (and DPR:$src3, (vnot_conv DPR:$src1)))))]>; + (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, + (v2i32 (or (and DPR:$src2, DPR:$src1), + (and DPR:$src3, (vnot8 DPR:$src1)))))]>; def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, - (v4i32 (or (and QPR:$src2, QPR:$src1), - (and QPR:$src3, (vnot_conv QPR:$src1)))))]>; + (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (v4i32 (or (and QPR:$src2, QPR:$src1), + (and QPR:$src3, (vnot16 QPR:$src1)))))]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT : Vector Bitwise Insert if True // like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking @@ -2463,15 +2446,15 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, // Vector Absolute Differences. // VABD : Vector Absolute Difference -defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vabd", "s", int_arm_neon_vabds, 0>; -defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vabd", "u", int_arm_neon_vabdu, 0>; -def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND, +def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>; -def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ, +def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) @@ -2491,36 +2474,40 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>; // Vector Maximum and Minimum. // VMAX : Vector Maximum -defm VMAXs : N3VInt_QHS<0,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmax", "s", int_arm_neon_vmaxs, 1>; -defm VMAXu : N3VInt_QHS<1,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmax", "u", int_arm_neon_vmaxu, 1>; -def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax", "f32", - v2f32, v2f32, int_arm_neon_vmaxs, 1>; -def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax", "f32", - v4f32, v4f32, int_arm_neon_vmaxs, 1>; +defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmax", "s", int_arm_neon_vmaxs, 1>; +defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmax", "u", int_arm_neon_vmaxu, 1>; +def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", + "f32", v2f32, v2f32, int_arm_neon_vmaxs, 1>; +def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", + "f32", v4f32, v4f32, int_arm_neon_vmaxs, 1>; // VMIN : Vector Minimum -defm VMINs : N3VInt_QHS<0,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmin", "s", int_arm_neon_vmins, 1>; -defm VMINu : N3VInt_QHS<1,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmin", "u", int_arm_neon_vminu, 1>; -def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin", "f32", - v2f32, v2f32, int_arm_neon_vmins, 1>; -def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin", "f32", - v4f32, v4f32, int_arm_neon_vmins, 1>; +defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmin", "s", int_arm_neon_vmins, 1>; +defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmin", "u", int_arm_neon_vminu, 1>; +def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", + "f32", v2f32, v2f32, int_arm_neon_vmins, 1>; +def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", + "f32", v4f32, v4f32, int_arm_neon_vmins, 1>; // Vector Pairwise Operations. // VPADD : Vector Pairwise Add -def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd", "i8", - v8i8, v8i8, int_arm_neon_vpadd, 0>; -def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd", "i16", - v4i16, v4i16, int_arm_neon_vpadd, 0>; -def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd", "i32", - v2i32, v2i32, int_arm_neon_vpadd, 0>; -def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd", "f32", - v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i8", v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i16", v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i32", v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VBIND, "vpadd", + "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -2535,36 +2522,36 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", int_arm_neon_vpadalu>; // VPMAX : Vector Pairwise Maximum -def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "s8", - v8i8, v8i8, int_arm_neon_vpmaxs, 0>; -def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "s16", - v4i16, v4i16, int_arm_neon_vpmaxs, 0>; -def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "s32", - v2i32, v2i32, int_arm_neon_vpmaxs, 0>; -def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "u8", - v8i8, v8i8, int_arm_neon_vpmaxu, 0>; -def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "u16", - v4i16, v4i16, int_arm_neon_vpmaxu, 0>; -def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "u32", - v2i32, v2i32, int_arm_neon_vpmaxu, 0>; -def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax", "f32", - v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; // VPMIN : Vector Pairwise Minimum -def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "s8", - v8i8, v8i8, int_arm_neon_vpmins, 0>; -def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "s16", - v4i16, v4i16, int_arm_neon_vpmins, 0>; -def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "s32", - v2i32, v2i32, int_arm_neon_vpmins, 0>; -def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "u8", - v8i8, v8i8, int_arm_neon_vpminu, 0>; -def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "u16", - v4i16, v4i16, int_arm_neon_vpminu, 0>; -def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "u32", - v2i32, v2i32, int_arm_neon_vpminu, 0>; -def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin", "f32", - v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -2583,10 +2570,10 @@ def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, v4f32, v4f32, int_arm_neon_vrecpe>; // VRECPS : Vector Reciprocal Step -def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrecps", "f32", v2f32, v2f32, int_arm_neon_vrecps, 1>; -def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; @@ -2605,25 +2592,30 @@ def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, v4f32, v4f32, int_arm_neon_vrsqrte>; // VRSQRTS : Vector Reciprocal Square Root Step -def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrsqrts", "f32", v2f32, v2f32, int_arm_neon_vrsqrts, 1>; -def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; // Vector Shifts. // VSHL : Vector Shift -defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "s", int_arm_neon_vshifts, 0>; -defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu, 0>; +defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts, 0>; +defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu, 0>; // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; +defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, + N2RegVShLFrm>; // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs>; -defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru>; +defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, + N2RegVShRFrm>; +defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, + N2RegVShRFrm>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -2649,28 +2641,37 @@ defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>; // VRSHL : Vector Rounding Shift -defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts,0>; -defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu,0>; +defm VRSHLs : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts, 0>; +defm VRSHLu : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", int_arm_neon_vrshiftu, 0>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>; -defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>; +defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, + N2RegVShRFrm>; +defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, + N2RegVShRFrm>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", NEONvrshrn>; // VQSHL : Vector Saturating Shift -defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts,0>; -defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu,0>; +defm VQSHLs : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts, 0>; +defm VQSHLu : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu, 0>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>; -defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>; +defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, + N2RegVShLFrm>; +defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, + N2RegVShLFrm>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D, "vqshlu","s",NEONvqshlsu>; +defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, + N2RegVShLFrm>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -2683,12 +2684,12 @@ defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", NEONvqshrnsu>; // VQRSHL : Vector Saturating Rounding Shift -defm VQRSHLs : N3VInt_QHSD<0,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "s", - int_arm_neon_vqrshifts, 0>; -defm VQRSHLu : N3VInt_QHSD<1,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "u", - int_arm_neon_vqrshiftu, 0>; +defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts, 0>; +defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", @@ -2708,9 +2709,9 @@ defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; // VSLI : Vector Shift Left and Insert -defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli>; +defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; // VSRI : Vector Shift Right and Insert -defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri>; +defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; // Vector Absolute and Saturating Absolute. @@ -2732,19 +2733,22 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. -def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; -def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>; +def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +def vneg8 : PatFrag<(ops node:$in), + (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>; +def vneg16 : PatFrag<(ops node:$in), + (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>; class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (vneg DPR:$src)))]>; + [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>; class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (vneg QPR:$src)))]>; + [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>; -// VNEG : Vector Negate +// VNEG : Vector Negate (integer) def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; @@ -2762,12 +2766,12 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, "vneg", "f32", "$dst, $src", "", [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; -def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>; -def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>; -def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>; -def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>; -def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>; -def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>; +def : Pat<(v8i8 (vneg8 DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vneg8 DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vneg8 DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vneg16 QPR:$src)), (VNEGs32q QPR:$src)>; // VQNEG : Vector Saturating Negate defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, @@ -2805,9 +2809,9 @@ def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, // VMOV : Vector Move (Register) def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; // VMOV : Vector Move (Immediate) @@ -3048,30 +3052,29 @@ def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), // VDUP : Vector Duplicate Lane (from scalar to all elements) -class VDUPLND<bits<2> op19_18, bits<2> op17_16, - string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0, - (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; +class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType Ty> + : NVDupLane<op19_16, 0, (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; -class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, string Dt, +class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0, - (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>; + : NVDupLane<op19_16, 1, (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), + imm:$lane)))]>; // Inst{19-16} is partially specified depending on the element size. -def VDUPLN8d : VDUPLND<{?,?}, {?,1}, "vdup", "8", v8i8>; -def VDUPLN16d : VDUPLND<{?,?}, {1,0}, "vdup", "16", v4i16>; -def VDUPLN32d : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2i32>; -def VDUPLNfd : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2f32>; -def VDUPLN8q : VDUPLNQ<{?,?}, {?,1}, "vdup", "8", v16i8, v8i8>; -def VDUPLN16q : VDUPLNQ<{?,?}, {1,0}, "vdup", "16", v8i16, v4i16>; -def VDUPLN32q : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4i32, v2i32>; -def VDUPLNfq : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4f32, v2f32>; +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>; +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>; +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>; +def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>; +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>; +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>; +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>; +def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>; def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -3233,15 +3236,15 @@ def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; class VEXTd<string OpcodeStr, string Dt, ValueType Ty> : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst), - (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs), (Ty DPR:$rhs), imm:$index)))]>; class VEXTq<string OpcodeStr, string Dt, ValueType Ty> : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst), - (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs), (Ty QPR:$rhs), imm:$index)))]>; @@ -3290,25 +3293,26 @@ def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; // VTBL : Vector Table Lookup def VTBL1 : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$src), IIC_VTB1, + (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1, "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), + NVTBLFrm, IIC_VTB4, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; @@ -3317,26 +3321,27 @@ def VTBL4 // VTBX : Vector Table Extension def VTBX1 : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1, + (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1, "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1 DPR:$orig, DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), + NVTBLFrm, IIC_VTBX3, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, @@ -3396,12 +3401,12 @@ def : N3VSPat<fmul, VMULfd_sfp>; //let neverHasSideEffects = 1 in //def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", -// v2f32, fmul, fadd>; +// v2f32, fmul, fadd>; //def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; //let neverHasSideEffects = 1 in //def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", -// v2f32, fmul, fsub>; +// v2f32, fmul, fsub>; //def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; // Vector Absolute used for single-precision FP @@ -3421,14 +3426,14 @@ def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>; // Vector Maximum used for single-precision FP let neverHasSideEffects = 1 in def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmax", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat<NEONfmax, VMAXfd_sfp>; // Vector Minimum used for single-precision FP let neverHasSideEffects = 1 in def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmin", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat<NEONfmin, VMINfd_sfp>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index aca8230..0458389 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -545,7 +545,7 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, // FP FMA Operations. // -def VMLAD : ADbI<0b11100, 0b00, 0, 0, +def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), @@ -558,7 +558,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VNMLSD : ADbI<0b11100, 0b01, 0, 0, +def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), @@ -571,7 +571,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VMLSD : ADbI<0b11100, 0b00, 1, 0, +def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), @@ -589,7 +589,7 @@ def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; -def VNMLAD : ADbI<0b11100, 0b01, 1, 0, +def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index bdbec30..cb762a4 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -341,6 +341,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? UINT_MAX : ARMRegisterInfo::getRegisterNumbering(PReg); + unsigned Count = 1; for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { int NewOffset = MemOps[i].Offset; @@ -350,11 +351,14 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, : ARMRegisterInfo::getRegisterNumbering(Reg); // AM4 - register numbers in ascending order. // AM5 - consecutive register numbers in ascending order. + // Can only do up to 16 double-word registers per insn. if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) { + ((isAM4 && RegNum > PRegNum) + || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; + ++Count; } else { // Can't merge this in. Try merge the earlier ones first. MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 2dad7f1..9e55cd8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -22,10 +22,6 @@ using namespace llvm; static cl::opt<bool> ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); -static cl::opt<bool> -UseNEONFP("arm-use-neon-fp", - cl::desc("Use NEON for single-precision FP"), - cl::init(false), cl::Hidden); static cl::opt<bool> UseMOVT("arm-use-movt", @@ -35,7 +31,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isT) : ARMArchVersion(V4) , ARMFPUType(None) - , UseNEONForSinglePrecisionFP(UseNEONFP) + , UseNEONForSinglePrecisionFP(false) + , SlowVMLx(false) , IsThumb(isT) , ThumbMode(Thumb1) , PostRAScheduler(false) @@ -115,14 +112,6 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, if (!isThumb() || hasThumb2()) PostRAScheduler = true; - - // Set CPU specific features. - if (CPUString == "cortex-a8") { - // On Cortex-a8, it's faster to perform some single-precision FP - // operations with NEON instructions. - if (UseNEONFP.getPosition() == 0) - UseNEONForSinglePrecisionFP = true; - } } /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 2dc81a4..fa56a91 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -50,6 +50,10 @@ protected: /// determine if NEON should actually be used. bool UseNEONForSinglePrecisionFP; + /// SlowVMLx - If the VFP2 instructions are available, indicates whether + /// the VML[AS] instructions are slow (if so, don't use them). + bool SlowVMLx; + /// IsThumb - True if we are in thumb mode, false if in ARM mode. bool IsThumb; @@ -119,6 +123,7 @@ protected: bool hasNEON() const { return ARMFPUType >= NEON; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } + bool useVMLx() const {return hasVFP2() && !SlowVMLx; } bool hasFP16() const { return HasFP16; } diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index 4a7a1e4..ba736e3 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -841,7 +841,7 @@ GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << getFunctionNumber() << '_' << uid << '_' << uid2 << "_set_" << MBB->getNumber(); - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMAsmPrinter:: @@ -849,7 +849,7 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { SmallString<60> Name; raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid << '_' << uid2; - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) { @@ -1132,6 +1132,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); @@ -1186,7 +1191,7 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { // FIXME: MOVE TO SHARED PLACE. unsigned Id = (unsigned)MI->getOperand(2).getImm(); const char *Prefix = MAI->getPrivateGlobalPrefix(); - MCSymbol *Label =OutContext.GetOrCreateTemporarySymbol(Twine(Prefix) + MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); OutStreamer.EmitLabel(Label); diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp index 7cb305f..ab2b06b 100644 --- a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp @@ -75,7 +75,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMMCInstLower:: @@ -91,7 +91,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand ARMMCInstLower:: diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index c36fe63..7334259 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -46,10 +46,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: case ARM::VLD2d8: case ARM::VLD2d16: case ARM::VLD2d32: - case ARM::VLD2d64: case ARM::VLD2LNd8: case ARM::VLD2LNd16: case ARM::VLD2LNd32: @@ -83,7 +86,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD3d8: case ARM::VLD3d16: case ARM::VLD3d32: - case ARM::VLD3d64: + case ARM::VLD1d64T: case ARM::VLD3LNd8: case ARM::VLD3LNd16: case ARM::VLD3LNd32: @@ -128,7 +131,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD4d8: case ARM::VLD4d16: case ARM::VLD4d32: - case ARM::VLD4d64: + case ARM::VLD1d64Q: case ARM::VLD4LNd8: case ARM::VLD4LNd16: case ARM::VLD4LNd32: @@ -170,10 +173,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: case ARM::VST2d8: case ARM::VST2d16: case ARM::VST2d32: - case ARM::VST2d64: case ARM::VST2LNd8: case ARM::VST2LNd16: case ARM::VST2LNd32: @@ -207,7 +213,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: - case ARM::VST3d64: + case ARM::VST1d64T: case ARM::VST3LNd8: case ARM::VST3LNd16: case ARM::VST3LNd32: @@ -252,7 +258,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST4d8: case ARM::VST4d16: case ARM::VST4d32: - case ARM::VST4d64: + case ARM::VST1d64Q: case ARM::VST4LNd8: case ARM::VST4LNd16: case ARM::VST4LNd32: diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 57b65cf..85d5ca0 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -12,6 +12,9 @@ Reimplement 'select' in terms of 'SEL'. A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX +Interesting optimization for PIC codegen on arm-linux: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129 + //===---------------------------------------------------------------------===// Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 29ae631..ad98839 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -200,6 +200,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // It's illegal to emit pop instruction without operands. if (NumRegs) MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); return true; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index e4abcdb..55163f9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -69,7 +69,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = @@ -93,7 +93,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = |