author     rdivacky <rdivacky@FreeBSD.org>   2010-04-02 08:54:30 +0000
committer  rdivacky <rdivacky@FreeBSD.org>   2010-04-02 08:54:30 +0000
commit     20e856b2a58d12231aa42d5d13888b15ac03e5a4
tree       cf5763d092b81cecc168fa28032247ee495d06e2   /lib/Target
parent     2f2afc1aae898651e26987a5c71f3febb19bca98
Update LLVM to r100181.
Diffstat (limited to 'lib/Target')
72 files changed, 2678 insertions, 1734 deletions
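The ARMBaseInstrInfo hunks below make AnalyzeBranch and RemoveBranch step over DBG_VALUE instructions when scanning a block from its end, so debug info no longer perturbs branch analysis. A minimal, self-contained sketch of that backward-scan pattern, using a stand-in Instr type rather than LLVM's MachineInstr (the names here are illustrative, not from the tree):

#include <iostream>
#include <vector>

struct Instr {
  bool IsDebugValue;   // stand-in for MachineInstr::isDebugValue()
  bool IsTerminator;   // stand-in for an unpredicated terminator
};

// Walk backwards from the end of a block, skipping debug values, and return
// the last "real" instruction (or nullptr if there is none) -- the same scan
// the patch adds in front of the isUnpredicatedTerminator() check.
const Instr *lastRealInstr(const std::vector<Instr> &MBB) {
  auto I = MBB.end();
  if (I == MBB.begin())
    return nullptr;                 // empty block: just falls through
  --I;
  while (I->IsDebugValue) {         // ignore DBG_VALUEs at the block's tail
    if (I == MBB.begin())
      return nullptr;               // nothing but debug values
    --I;
  }
  return &*I;
}

int main() {
  std::vector<Instr> MBB = {{false, false}, {false, true}, {true, false}};
  const Instr *Last = lastRealInstr(MBB);
  std::cout << (Last && Last->IsTerminator ? "ends in a terminator\n"
                                           : "falls through\n");
}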
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index bbb1dbd..6486a60 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -43,6 +43,21 @@ def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2", def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; +// Some processors have multiply-accumulate instructions that don't +// play nicely with other VFP instructions, and it's generally better +// to just not use them. +// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for +// others as well. We should do more benchmarking and confirm one way or +// the other. +def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", + "Disable VFP MAC instructions">; +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + + //===----------------------------------------------------------------------===// // ARM Processors supported. // @@ -92,7 +107,8 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowVMLx]>; def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; @@ -106,7 +122,8 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, - [ArchV7A, FeatureThumb2, FeatureNEON]>; + [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, + FeatureNEONForFP]>; def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e6ea03a..0a0b0ea 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -204,7 +204,15 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
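The two subtarget features added in the ARM.td hunk above (vmlx, neonfp) gate how a plain single-precision multiply-accumulate is lowered. An illustrative function of the kind affected, not taken from the tree, assuming the UseVMLx and NEON-for-FP predicates introduced later in this patch:

// Illustrative only: a scalar multiply-accumulate of the kind the new
// vmlx/neonfp features are about.  With FeatureHasSlowVMLx set (now on
// cortex-a8 and arm1136jf-s) the backend stops folding this into a VFP
// vmla.f32 and emits a separate multiply and add; with FeatureNEONForFP the
// single-precision arithmetic is issued on the NEON unit instead of VFP.
float mac(float a, float b, float c) {
  return a * b + c;
}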
@@ -275,6 +283,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isUncondBranchOpcode(I->getOpcode()) && !isCondBranchOpcode(I->getOpcode())) return 0; @@ -738,14 +751,16 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!"); // FIXME: Neon instructions should support predicates if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO) .addReg(SrcReg, getKillRegState(isKill))); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRQ)). + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)). addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } @@ -788,12 +803,14 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_8RegisterClass) && "Unknown regclass!"); if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO)); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 71207c8..7d48663 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -124,17 +124,17 @@ private: /// SelectDYN_ALLOC - Select dynamic alloc for Thumb. SDNode *SelectDYN_ALLOC(SDNode *N); - /// SelectVLD - Select NEON load intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. 
SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); @@ -1022,7 +1022,7 @@ static EVT GetNEONSubregVT(EVT VT) { SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1047,6 +1047,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1060,15 +1063,15 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VLD2, - // loading 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VLD1 and VLD2, + // loading pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector<EVT> ResTys(4, VT); + std::vector<EVT> ResTys(2 * NumVecs, RegVT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - Chain = SDValue(VLd, 4); + Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { @@ -1109,7 +1112,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1134,6 +1137,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1154,9 +1160,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VST2, - // storing 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VST1 and VST2, + // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, @@ -1167,7 +1173,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 9); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), + 5 + 2 * NumVecs); } // Otherwise, quad registers are stored with two separate instructions, @@ -1694,6 +1701,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ResNode = SelectARMIndexedLoad(N); if (ResNode) return ResNode; + + // VLDMQ must be custom-selected for "v2f64 load" to set the AM5Opc value. 
+ if (Subtarget->hasVFP2() && + N->getValueType(0).getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VLDMQ, dl, MVT::v2f64, MVT::Other, + Ops, 5); + } + // Other cases are autogenerated. + break; + } + case ISD::STORE: { + // VSTMQ must be custom-selected for "v2f64 store" to set the AM5Opc value. + if (Subtarget->hasVFP2() && + N->getOperand(1).getValueType().getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), N->getOperand(2), + AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VSTMQ, dl, MVT::Other, Ops, 6); + } // Other cases are autogenerated. break; } @@ -1831,16 +1867,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vld2: { unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD2d64 }; + ARM::VLD2d32, ARM::VLD1q64 }; unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, - ARM::VLD3d32, ARM::VLD3d64 }; + ARM::VLD3d32, ARM::VLD1d64T }; unsigned QOpcodes0[] = { ARM::VLD3q8_UPD, ARM::VLD3q16_UPD, ARM::VLD3q32_UPD }; @@ -1852,7 +1896,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vld4: { unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, - ARM::VLD4d32, ARM::VLD4d64 }; + ARM::VLD4d32, ARM::VLD1d64Q }; unsigned QOpcodes0[] = { ARM::VLD4q8_UPD, ARM::VLD4q16_UPD, ARM::VLD4q32_UPD }; @@ -1883,16 +1927,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } + case Intrinsic::arm_neon_vst1: { + unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vst2: { unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST2d64 }; + ARM::VST2d32, ARM::VST1q64 }; unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 }; return SelectVST(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16, - ARM::VST3d32, ARM::VST3d64 }; + ARM::VST3d32, ARM::VST1d64T }; unsigned QOpcodes0[] = { ARM::VST3q8_UPD, ARM::VST3q16_UPD, ARM::VST3q32_UPD }; @@ -1904,7 +1956,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vst4: { unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, - ARM::VST4d32, ARM::VST4d64 }; + ARM::VST4d32, ARM::VST1d64Q }; unsigned QOpcodes0[] = { ARM::VST4q8_UPD, ARM::VST4q16_UPD, ARM::VST4q32_UPD }; diff --git 
a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0d0a004..b6c81f6 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -456,6 +456,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // Generic (and overly aggressive) if-conversion limits. setIfCvtBlockSizeLimit(10); setIfCvtDupBlockSizeLimit(2); + } else if (Subtarget->hasV7Ops()) { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(1); } else if (Subtarget->hasV6Ops()) { setIfCvtBlockSizeLimit(2); setIfCvtDupBlockSizeLimit(1); diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 4f6f05d..4427e50 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -1,10 +1,10 @@ //===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -59,7 +59,18 @@ def NEONDupFrm : Format<28>; def MiscFrm : Format<29>; def ThumbMiscFrm : Format<30>; -def NLdStFrm : Format<31>; +def NLdStFrm : Format<31>; +def N1RegModImmFrm : Format<32>; +def N2RegFrm : Format<33>; +def NVCVTFrm : Format<34>; +def NVDupLnFrm : Format<35>; +def N2RegVShLFrm : Format<36>; +def N2RegVShRFrm : Format<37>; +def N3RegFrm : Format<38>; +def N3RegVShFrm : Format<39>; +def NVExtFrm : Format<40>; +def NVMulSLFrm : Format<41>; +def NVTBLFrm : Format<42>; // Misc flags. @@ -177,13 +188,13 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, // TSFlagsFields AddrMode AM = am; bits<4> AddrModeBits = AM.Value; - + SizeFlagVal SZ = sz; bits<3> SizeFlag = SZ.Value; IndexMode IM = im; bits<2> IndexModeBits = IM.Value; - + Format F = f; bits<6> Form = F.Value; @@ -195,7 +206,7 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, // bit isUnaryDataProc = 0; bit canXformTo16Bit = 0; - + let Constraints = cstr; let Itinerary = itin; } @@ -214,9 +225,9 @@ class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im, Format f, Domain d, string cstr, InstrItinClass itin> : InstTemplate<am, sz, im, f, d, cstr, itin>; -class PseudoInst<dag oops, dag iops, InstrItinClass itin, +class PseudoInst<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, + : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, "", itin> { let OutOperandList = oops; let InOperandList = iops; @@ -226,7 +237,7 @@ class PseudoInst<dag oops, dag iops, InstrItinClass itin, // Almost all ARM instructions are predicable. 
class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, InstrItinClass itin, + IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { @@ -238,9 +249,9 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, } // A few are not predicable class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, InstrItinClass itin, - string opc, string asm, string cstr, - list<dag> pattern> + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; @@ -290,9 +301,9 @@ class AXI<dag oops, dag iops, Format f, InstrItinClass itin, : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern>; class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> + string opc, string asm, list<dag> pattern> : InoP<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern>; + opc, asm, "", pattern>; // Ctrl flow instructions class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, @@ -362,7 +373,7 @@ class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, +class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin, opc, asm, "", pattern>; @@ -387,7 +398,7 @@ class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, +class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { @@ -407,7 +418,7 @@ class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, +class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { @@ -549,7 +560,7 @@ class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, } // addrmode3 instructions -class AI3<dag oops, dag iops, Format f, InstrItinClass itin, +class AI3<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern>; @@ -853,7 +864,6 @@ class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{27-25} = 0b000; } - // addrmode4 instructions class AXI4ld<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, string asm, string cstr, list<dag> pattern> @@ -961,20 +971,25 @@ class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; // Two-address instructions -class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>; +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : 
ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", + pattern>; // tBL, tBX 32-bit instructions class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, - dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, Encoding { + dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, + Encoding { let Inst{31-27} = opcod1; let Inst{15-14} = opcod2; let Inst{12} = opcod3; } // BR_JT instructions -class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; // Thumb1 only @@ -1001,7 +1016,7 @@ class T1JTI<dag oops, dag iops, InstrItinClass itin, // Two-address instructions class T1It<dag oops, dag iops, InstrItinClass itin, string asm, string cstr, list<dag> pattern> - : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, cstr, pattern>; // Thumb1 instruction that can either be predicated or set CPSR. @@ -1024,7 +1039,7 @@ class T1sI<dag oops, dag iops, InstrItinClass itin, class T1sIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$lhs = $dst", pattern>; // Thumb1 instruction that can be predicated. class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -1046,7 +1061,7 @@ class T1pI<dag oops, dag iops, InstrItinClass itin, class T1pIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$lhs = $dst", pattern>; class T1pI1<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -1057,7 +1072,7 @@ class T1pI2<dag oops, dag iops, InstrItinClass itin, class T1pI4<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>; -class T1pIs<dag oops, dag iops, +class T1pIs<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>; @@ -1146,8 +1161,8 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, } class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - InstrItinClass itin, - string asm, string cstr, list<dag> pattern> + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; @@ -1161,7 +1176,7 @@ class T2I<dag oops, dag iops, InstrItinClass itin, : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; class T2Ii12<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>; + : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "",pattern>; class T2Ii8<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>; @@ -1196,7 +1211,7 @@ class T2JTI<dag oops, dag iops, 
InstrItinClass itin, : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; class T2Ix2<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> + string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>; // Two-address instructions @@ -1295,7 +1310,7 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, itin, opc, asm, "", pattern> { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1309,7 +1324,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, itin, opc, asm, "", pattern> { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1320,7 +1335,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, - VFPLdStMulFrm, itin, asm, cstr, pattern> { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1011; @@ -1332,7 +1347,7 @@ class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, class AXSI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, - VFPLdStMulFrm, itin, asm, cstr, pattern> { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. 
let Inst{27-25} = 0b110; let Inst{11-8} = 0b1010; @@ -1353,7 +1368,8 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, // Double precision, binary class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; @@ -1362,6 +1378,20 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{4} = op4; } +// Double precision, binary, VML[AS] (for additional predicate) +class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + let Inst{6} = op6; + let Inst{4} = op4; + list<Predicate> Predicates = [HasVFP2, UseVMLx]; +} + + // Single precision, unary class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, @@ -1399,7 +1429,8 @@ class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, // Single precision binary, if no NEON // Same as ASbI except not available if NEON is enabled class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; } @@ -1419,8 +1450,8 @@ class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, // VFP conversion between floating-point and fixed-point class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> { // size (fixed-point number): sx == 0 ? 16 : 32 let Inst{7} = op5; // sx @@ -1448,7 +1479,7 @@ class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; -class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; @@ -1480,9 +1511,10 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, } // Same as NeonI except it does not have a "data type" specifier. 
-class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, Size4Bytes, im, NEONFrm, NeonDomain, cstr, itin> { +class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm)); @@ -1490,18 +1522,6 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, list<Predicate> Predicates = [HasNEON]; } -class NI<dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> - : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, "", - pattern> { -} - -class NI4<dag oops, dag iops, InstrItinClass itin, string opc, - string asm, list<dag> pattern> - : NeonXI<oops, iops, AddrMode4, IndexModeNone, itin, opc, asm, "", - pattern> { -} - class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> @@ -1514,17 +1534,17 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, let Inst{7-4} = op7_4; } -class NDataI<dag oops, dag iops, InstrItinClass itin, +class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NeonI<oops, iops, AddrModeNone, IndexModeNone, NEONFrm, itin, opc, dt, asm, - cstr, pattern> { + : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, + pattern> { let Inst{31-25} = 0b1111001; } -class NDataXI<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, - cstr, pattern> { +class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, + cstr, pattern> { let Inst{31-25} = 0b1111001; } @@ -1532,8 +1552,9 @@ class NDataXI<dag oops, dag iops, InstrItinClass itin, class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, bit op5, bit op4, dag oops, dag iops, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{23} = op23; let Inst{21-19} = op21_19; let Inst{11-8} = op11_8; @@ -1548,7 +1569,7 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1560,10 +1581,10 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // Same as N2V except it doesn't have a datatype suffix. 
class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> { + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1575,9 +1596,9 @@ class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // NEON 2 vector register with immediate. class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{11-8} = op11_8; @@ -1588,9 +1609,9 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // NEON 3 vector register format. class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1599,11 +1620,12 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{4} = op4; } -// Same as N3VX except it doesn't have a data type suffix. -class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> { +// Same as N3V except it doesn't have a data type suffix. 
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1617,7 +1639,7 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain, - "", itin> { + "", itin> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; @@ -1647,6 +1669,19 @@ class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, opc, dt, asm, pattern>; +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list<dag> pattern> + : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; +} + // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON // for single-precision FP. class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 26a2806..f2ab06f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -140,6 +140,8 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseVMLx : Predicate<"Subtarget->useVMLx()">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index c977cc3..ed9d31d 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -115,51 +115,80 @@ def h64imm : Operand<i64> { // NEON load / store instructions //===----------------------------------------------------------------------===// +let mayLoad = 1 in { // Use vldmia to load a Q register as a D register pair. -def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, - "vldmia", "$addr, ${dst:dregpair}", - [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 1; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VLDMD except that it has a Q register operand +// instead of a pair of D registers. +def VLDMQ + : AXDI5<(outs QPR:$dst), (ins addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, ${dst:dregpair}", "", []>; +def VLDMQ_UPD + : AXDI5<(outs QPR:$dst, GPR:$wb), (ins addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}!, ${dst:dregpair}", + "$addr.base = $wb", []>; + +// Use vld1 to load a Q register as a D register pair. +// This alternative to VLDMQ allows an alignment to be specified. +// This is equivalent to VLD1q64 except that it has a Q register operand. 
+def VLD1q + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), + IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; +def VLD1q_UPD + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", "64", + "${dst:dregpair}, $addr$offset", "$addr.addr = $wb", []>; +} // mayLoad = 1 +let mayStore = 1 in { // Use vstmia to store a Q register as a D register pair. -def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, - "vstmia", "$addr, ${src:dregpair}", - [(store (v2f64 QPR:$src), addrmode4:$addr)]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 0; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VSTMD except that it has a Q register operand +// instead of a pair of D registers. +def VSTMQ + : AXDI5<(outs), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, ${src:dregpair}", "", []>; +def VSTMQ_UPD + : AXDI5<(outs GPR:$wb), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}!, ${src:dregpair}", + "$addr.base = $wb", []>; + +// Use vst1 to store a Q register as a D register pair. +// This alternative to VSTMQ allows an alignment to be specified. +// This is equivalent to VST1q64 except that it has a Q register operand. +def VST1q + : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src), + IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>; +def VST1q_UPD + : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), + IIC_VST, "vst1", "64", "{$src:dregpair}, $addr$offset", + "$addr.addr = $wb", []>; +} // mayStore = 1 -// VLD1 : Vector Load (multiple single elements) -class VLD1D<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr", "", - [(set DPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; -class VLD1Q<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "${dst:dregpair}, $addr", "", - [(set QPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; - -def VLD1d8 : VLD1D<0b0000, "8", v8i8>; -def VLD1d16 : VLD1D<0b0100, "16", v4i16>; -def VLD1d32 : VLD1D<0b1000, "32", v2i32>; -def VLD1df : VLD1D<0b1000, "32", v2f32>; -def VLD1d64 : VLD1D<0b1100, "64", v1i64>; - -def VLD1q8 : VLD1Q<0b0000, "8", v16i8>; -def VLD1q16 : VLD1Q<0b0100, "16", v8i16>; -def VLD1q32 : VLD1Q<0b1000, "32", v4i32>; -def VLD1qf : VLD1Q<0b1000, "32", v4f32>; -def VLD1q64 : VLD1Q<0b1100, "64", v2i64>; +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { -let mayLoad = 1 in { +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr", "", []>; +class VLD1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + +def VLD1d8 : VLD1D<0b0000, "8">; +def VLD1d16 : VLD1D<0b0100, "16">; +def VLD1d32 : VLD1D<0b1000, "32">; +def VLD1d64 : VLD1D<0b1100, "64">; + +def VLD1q8 : VLD1Q<0b0000, "8">; +def VLD1q16 : VLD1Q<0b0100, "16">; +def VLD1q32 : VLD1Q<0b1000, "32">; +def VLD1q64 : VLD1Q<0b1100, "64">; // ...with address register writeback: class 
VLD1DWB<bits<4> op7_4, string Dt> @@ -182,54 +211,48 @@ def VLD1q8_UPD : VLD1QWB<0b0000, "8">; def VLD1q16_UPD : VLD1QWB<0b0100, "16">; def VLD1q32_UPD : VLD1QWB<0b1000, "32">; def VLD1q64_UPD : VLD1QWB<0b1100, "64">; -} // mayLoad = 1 -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { - -// These (dreg triple/quadruple) are for disassembly only. +// ...with 3 registers (some of these are only for the disassembler): class VLD1D3<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VLD1D4<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VLD1d8T : VLD1D3<0b0000, "8">; -def VLD1d16T : VLD1D3<0b0100, "16">; -def VLD1d32T : VLD1D3<0b1000, "32">; -// VLD1d64T : implemented as VLD3d64 - -def VLD1d8Q : VLD1D4<0b0000, "8">; -def VLD1d16Q : VLD1D4<0b0100, "16">; -def VLD1d32Q : VLD1D4<0b1000, "32">; -// VLD1d64Q : implemented as VLD4d64 - -// ...with address register writeback: + "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; class VLD1D3WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; + +def VLD1d8T : VLD1D3<0b0000, "8">; +def VLD1d16T : VLD1D3<0b0100, "16">; +def VLD1d32T : VLD1D3<0b1000, "32">; +def VLD1d64T : VLD1D3<0b1100, "64">; + +def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; +def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; +def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; +def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VLD1D4<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; class VLD1D4WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0010,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + []>; -def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; -def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; -def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; -// VLD1d64T_UPD : implemented as VLD3d64_UPD +def VLD1d8Q : VLD1D4<0b0000, "8">; +def VLD1d16Q : VLD1D4<0b0100, "16">; +def VLD1d32Q : VLD1D4<0b1000, "32">; +def VLD1d64Q : VLD1D4<0b1100, "64">; def VLD1d8Q_UPD : VLD1D4WB<0b0000, "8">; def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; -// VLD1d64Q_UPD : implemented as VLD4d64_UPD +def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; // VLD2 : Vector Load (multiple 2-element structures) class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -245,9 +268,6 @@ class VLD2Q<bits<4> op7_4, string Dt> def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; -def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, 
DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr", "", []>; def VLD2q8 : VLD2Q<0b0000, "8">; def VLD2q16 : VLD2Q<0b0100, "16">; @@ -269,11 +289,6 @@ class VLD2QWB<bits<4> op7_4, string Dt> def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; -def VLD2d64_UPD : NLdSt<0,0b10,0b1010,0b1100, - (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VLD2q8_UPD : VLD2QWB<0b0000, "8">; def VLD2q16_UPD : VLD2QWB<0b0100, "16">; @@ -296,10 +311,6 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; def VLD3d32 : VLD3D<0b0100, 0b1000, "32">; -def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; // ...with address register writeback: class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -312,11 +323,6 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; -def VLD3d64_UPD : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; @@ -341,11 +347,6 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; -def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", - "", []>; // ...with address register writeback: class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -358,13 +359,6 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; -def VLD4d64_UPD : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, - GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; @@ -383,62 +377,62 @@ def VLD4q32odd_UPD : VLD4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. 
// VLD2LN : Vector Load (single 2-element structure to one lane) -class VLD2LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2), +class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2", []>; -def VLD2LNd8 : VLD2LN<0b0001, "8">; -def VLD2LNd16 : VLD2LN<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32 : VLD2LN<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD2LNq16 : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32 : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD2LNq16odd : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32odd : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD2LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), +class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset", "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>; -def VLD2LNd8_UPD : VLD2LNWB<0b0001, "8">; -def VLD2LNd16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; -def VLD2LNq16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; // VLD3LN : Vector Load (single 3-element structure to one lane) -class VLD3LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), +class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3, "vld3", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; -def VLD3LNd8 : VLD3LN<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VLD3LNq16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd 
register numbers: -def VLD3LNq16odd : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32odd : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...with address register writeback: -class VLD3LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), @@ -447,37 +441,37 @@ class VLD3LNWB<bits<4> op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", []>; -def VLD3LNd8_UPD : VLD3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; -def VLD3LNq16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; // VLD4LN : Vector Load (single 4-element structure to one lane) -class VLD4LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4, "vld4", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; -def VLD4LNd8 : VLD4LN<0b0011, "8">; -def VLD4LNd16 : VLD4LN<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32 : VLD4LN<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD4LNq16 : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32 : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD4LNq16odd : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32odd : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD4LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), @@ -486,12 +480,12 @@ class VLD4LNWB<bits<4> op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb", []>; -def VLD4LNd8_UPD : VLD4LNWB<0b0011, "8">; -def VLD4LNd16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, 
"16">; +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; -def VLD4LNq16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; // VLD1DUP : Vector Load (single element to all lanes) // VLD2DUP : Vector Load (single 2-element structure to all lanes) @@ -500,31 +494,26 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } // FIXME: Not yet implemented. } // mayLoad = 1, hasExtraDefRegAllocReq = 1 +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + // VST1 : Vector Store (multiple single elements) -class VST1D<bits<4> op7_4, string Dt, ValueType Ty> +class VST1D<bits<4> op7_4, string Dt> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, - "vst1", Dt, "\\{$src\\}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty DPR:$src))]>; -class VST1Q<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, - "vst1", Dt, "${src:dregpair}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty QPR:$src))]>; - -let hasExtraSrcRegAllocReq = 1 in { -def VST1d8 : VST1D<0b0000, "8", v8i8>; -def VST1d16 : VST1D<0b0100, "16", v4i16>; -def VST1d32 : VST1D<0b1000, "32", v2i32>; -def VST1df : VST1D<0b1000, "32", v2f32>; -def VST1d64 : VST1D<0b1100, "64", v1i64>; - -def VST1q8 : VST1Q<0b0000, "8", v16i8>; -def VST1q16 : VST1Q<0b0100, "16", v8i16>; -def VST1q32 : VST1Q<0b1000, "32", v4i32>; -def VST1qf : VST1Q<0b1000, "32", v4f32>; -def VST1q64 : VST1Q<0b1100, "64", v2i64>; -} // hasExtraSrcRegAllocReq - -let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + "vst1", Dt, "\\{$src\\}, $addr", "", []>; +class VST1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b1010,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + +def VST1d8 : VST1D<0b0000, "8">; +def VST1d16 : VST1D<0b0100, "16">; +def VST1d32 : VST1D<0b1000, "32">; +def VST1d64 : VST1D<0b1100, "64">; + +def VST1q8 : VST1Q<0b0000, "8">; +def VST1q16 : VST1Q<0b0100, "16">; +def VST1q32 : VST1Q<0b1000, "32">; +def VST1q64 : VST1Q<0b1100, "64">; // ...with address register writeback: class VST1DWB<bits<4> op7_4, string Dt> @@ -546,53 +535,50 @@ def VST1q16_UPD : VST1QWB<0b0100, "16">; def VST1q32_UPD : VST1QWB<0b1000, "32">; def VST1q64_UPD : VST1QWB<0b1100, "64">; -// These (dreg triple/quadruple) are for disassembly only. 
+// ...with 3 registers (some of these are only for the disassembler): class VST1D3<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VST1D4<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VST1d8T : VST1D3<0b0000, "8">; -def VST1d16T : VST1D3<0b0100, "16">; -def VST1d32T : VST1D3<0b1000, "32">; -// VST1d64T : implemented as VST3d64 - -def VST1d8Q : VST1D4<0b0000, "8">; -def VST1d16Q : VST1D4<0b0100, "16">; -def VST1d32Q : VST1D4<0b1000, "32">; -// VST1d64Q : implemented as VST4d64 - -// ...with address register writeback: + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; class VST1D3WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; + +def VST1d8T : VST1D3<0b0000, "8">; +def VST1d16T : VST1D3<0b0100, "16">; +def VST1d32T : VST1D3<0b1000, "32">; +def VST1d64T : VST1D3<0b1100, "64">; + +def VST1d8T_UPD : VST1D3WB<0b0000, "8">; +def VST1d16T_UPD : VST1D3WB<0b0100, "16">; +def VST1d32T_UPD : VST1D3WB<0b1000, "32">; +def VST1d64T_UPD : VST1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VST1D4<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", + []>; class VST1D4WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; -def VST1d8T_UPD : VST1D3WB<0b0000, "8">; -def VST1d16T_UPD : VST1D3WB<0b0100, "16">; -def VST1d32T_UPD : VST1D3WB<0b1000, "32">; -// VST1d64T_UPD : implemented as VST3d64_UPD +def VST1d8Q : VST1D4<0b0000, "8">; +def VST1d16Q : VST1D4<0b0100, "16">; +def VST1d32Q : VST1D4<0b1000, "32">; +def VST1d64Q : VST1D4<0b1100, "64">; def VST1d8Q_UPD : VST1D4WB<0b0000, "8">; def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; -// VST1d64Q_UPD : implemented as VST4d64_UPD +def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; // VST2 : Vector Store (multiple 2-element structures) class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -608,9 +594,6 @@ class VST2Q<bits<4> op7_4, string Dt> def VST2d8 : VST2D<0b1000, 0b0000, "8">; def VST2d16 : VST2D<0b1000, 0b0100, "16">; def VST2d32 : VST2D<0b1000, 0b1000, "32">; -def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr", "", []>; def VST2q8 : VST2Q<0b0000, "8">; def VST2q16 : VST2Q<0b0100, "16">; @@ -632,11 +615,6 @@ class VST2QWB<bits<4> op7_4, string Dt> def VST2d8_UPD : VST2DWB<0b1000, 0b0000, "8">; def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">; def 
VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">; -def VST2d64_UPD : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VST2q8_UPD : VST2QWB<0b0000, "8">; def VST2q16_UPD : VST2QWB<0b0100, "16">; @@ -659,10 +637,6 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> def VST3d8 : VST3D<0b0100, 0b0000, "8">; def VST3d16 : VST3D<0b0100, 0b0100, "16">; def VST3d32 : VST3D<0b0100, 0b1000, "32">; -def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr", "", []>; // ...with address register writeback: class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -675,11 +649,6 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; -def VST3d64_UPD : NLdSt<0,0b00,0b0110,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST3q8 : VST3D<0b0101, 0b0000, "8">; @@ -704,11 +673,6 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> def VST4d8 : VST4D<0b0000, 0b0000, "8">; def VST4d16 : VST4D<0b0000, 0b0100, "16">; def VST4d32 : VST4D<0b0000, 0b1000, "32">; -def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - DPR:$src4), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3, $src4\\}, $addr", - "", []>; // ...with address register writeback: class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -721,12 +685,6 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; -def VST4d64_UPD : NLdSt<0,0b00,0b0010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, - "vst1", "64", - "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST4q8 : VST4D<0b0001, 0b0000, "8">; @@ -745,109 +703,109 @@ def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. 
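The change running through the lane load hunks above and the lane store hunks that follow is that the VLDnLN/VSTnLN classes now receive bits 7-4 of the encoding as an explicit `bits<4> op7_4` template argument instead of each def patching individual bits with `let Inst{...}` overrides; a `?` in the bit list marks a bit (typically part of the lane index) that is still unknown when the def is written. A minimal sketch of the before/after idiom, with invented names and a trivial base class standing in for NLdSt:

class EncBits {
  bits<32> Inst;
}
class LaneBase<bits<4> op11_8, bits<4> op7_4> : EncBits {
  let Inst{11-8} = op11_8;
  let Inst{7-4}  = op7_4;   // '?' bits stay unset here
}

// Old style: the class pins nothing, every def overrides bits by hand.
class OldLN<bits<4> op11_8> : LaneBase<op11_8, {?,?,?,?}>;
def OldLNd16 : OldLN<0b0101> { let Inst{5} = 0; }

// New style: the known bits travel with the def as a template argument.
class NewLN<bits<4> op11_8, bits<4> op7_4> : LaneBase<op11_8, op7_4>;
def NewLNd16 : NewLN<0b0101, {?,?,0,?}>;

Both spellings leave the same bits set for the 16-bit case; the new one keeps the per-size bit pattern next to the opcode bits on the def line, which is what the `{?,?,0,?}`-style arguments in these hunks are doing.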
// VST2LN : Vector Store (single 2-element structure from one lane) -class VST2LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", "", []>; -def VST2LNd8 : VST2LN<0b0001, "8">; -def VST2LNd16 : VST2LN<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32 : VST2LN<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; +def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST2LNq16 : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32 : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST2LNq16odd : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32odd : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VST2LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST2LNd8_UPD : VST2LNWB<0b0001, "8">; -def VST2LNd16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8">; +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; -def VST2LNq16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; // VST3LN : Vector Store (single 3-element structure from one lane) -class VST3LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; -def VST3LNd8 : VST3LN<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VST3LNq16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST3LNq16odd : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32odd : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, 
"32">; // ...with address register writeback: -class VST3LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST3LNd8_UPD : VST3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; -def VST3LNq16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; // VST4LN : Vector Store (single 4-element structure from one lane) -class VST4LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", "", []>; -def VST4LNd8 : VST4LN<0b0011, "8">; -def VST4LNd16 : VST4LN<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32 : VST4LN<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST4LNq16 : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32 : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST4LNq16odd : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32odd : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VST4LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST4LNd8_UPD : VST4LNWB<0b0011, "8">; -def VST4LNd16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; -def VST4LNq16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">; } // mayStore = 1, hasExtraSrcRegAllocReq = 1 @@ -906,18 +864,18 @@ class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, 
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", + (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt,"$dst, $src", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src", "", + (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt,"$dst, $src", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; // Basic 2-register intrinsics, both double- and quad-register. class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, + bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), @@ -966,8 +924,8 @@ class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, - OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, + IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { let isCommutable = Commutable; } @@ -975,7 +933,7 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; @@ -986,27 +944,28 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]>{ let isCommutable = Commutable; } + class N3VDSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2), imm:$lane)))))]>{ + (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> { let isCommutable = 0; } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16D, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, 
$src2[$lane]","", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { @@ -1017,7 +976,7 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; @@ -1026,7 +985,7 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, OpcodeStr, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{ let isCommutable = Commutable; @@ -1036,7 +995,7 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1047,7 +1006,7 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16Q, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1057,10 +1016,10 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, // Basic 3-register intrinsics, both double- and quad-register. 
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), f, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; @@ -1069,7 +1028,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_VFP2:$src2), @@ -1080,19 +1039,18 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), - imm:$lane)))))]> { + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { let isCommutable = 0; } class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), f, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; @@ -1102,7 +1060,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1114,7 +1072,7 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1128,14 +1086,14 @@ class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, 
InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (Ty (OpNode DPR:$src1, (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; @@ -1144,7 +1102,8 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1156,7 +1115,8 @@ class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1168,7 +1128,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (Ty (OpNode QPR:$src1, (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; @@ -1177,7 +1137,8 @@ class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1190,7 +1151,8 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1204,7 +1166,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; @@ -1212,7 +1174,7 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, 
QPR:$src2, QPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; @@ -1223,7 +1185,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; @@ -1232,7 +1194,8 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), @@ -1244,7 +1207,8 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), @@ -1257,7 +1221,7 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D, + (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINi4D, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { let isCommutable = Commutable; @@ -1268,7 +1232,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -1277,8 +1241,8 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1288,7 +1252,7 @@ class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, 
"$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1299,7 +1263,7 @@ class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -1344,17 +1308,17 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), f, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), f, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; @@ -1363,8 +1327,8 @@ class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), (i32 imm:$SIMM))))]>; @@ -1373,7 +1337,7 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, + (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), N2RegVShRFrm, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), (i32 imm:$SIMM))))]>; @@ -1383,14 +1347,14 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VPALiD, + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set DPR:$dst, (Ty (add DPR:$src1, (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VPALiD, + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), N2RegVShRFrm, 
IIC_VPALiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set QPR:$dst, (Ty (add QPR:$src1, (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; @@ -1398,15 +1362,15 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Shift by immediate and insert, // both double- and quad-register. class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VSHLiD, + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), f, IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VSHLiQ, + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; @@ -1416,15 +1380,15 @@ class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; //===----------------------------------------------------------------------===// @@ -1568,24 +1532,24 @@ multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, // Neon 3-register vector intrinsics. // First with only element sizes of 16 and 32 bits: -multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> { // 64-bit vector types. - def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16, + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp, Commutable>; - def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32, + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp, Commutable>; // 128-bit vector types. 
- def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16, + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp, Commutable>; - def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32, + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp, Commutable>; } @@ -1605,38 +1569,37 @@ multiclass N3VIntSL_HS<bits<4> op11_8, } // ....then also with element size of 8 bits: -multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { - def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16, + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp, Commutable>; - def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16, + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp, Commutable>; } // ....then also with element size of 64 bits: -multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { - def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32, + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, OpcodeStr, !strconcat(Dt, "64"), v1i64, v1i64, IntOp, Commutable>; - def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32, + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32, OpcodeStr, !strconcat(Dt, "64"), v2i64, v2i64, IntOp, Commutable>; } - // Neon Narrowing 3-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1866,46 +1829,46 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode> { + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode, Format f> { // 64-bit vector types. 
- def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, itin, + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, f, itin, OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; // imm6 = xxxxxx // 128-bit vector types. - def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, itin, + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, f, itin, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx } - // Neon Shift-Accumulate vector operations, // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1947,41 +1910,43 @@ multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, // Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, SDNode ShOp> { + string OpcodeStr, SDNode ShOp, + Format f> { // 64-bit vector types. def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "8", v8i8, ShOp> { + f, OpcodeStr, "8", v8i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "16", v4i16, ShOp> { + f, OpcodeStr, "16", v4i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "32", v2i32, ShOp> { + f, OpcodeStr, "32", v2i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, - OpcodeStr, "64", v1i64, ShOp>; + f, OpcodeStr, "64", v1i64, ShOp>; // imm6 = xxxxxx // 128-bit vector types. 
def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "8", v16i8, ShOp> { + f, OpcodeStr, "8", v16i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "16", v8i16, ShOp> { + f, OpcodeStr, "16", v8i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "32", v4i32, ShOp> { + f, OpcodeStr, "32", v4i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, - OpcodeStr, "64", v2i64, ShOp>; + f, OpcodeStr, "64", v2i64, ShOp>; // imm6 = xxxxxx } @@ -2044,20 +2009,26 @@ defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl", "u", defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>; defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>; // VHADD : Vector Halving Add -defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "s", int_arm_neon_vhadds, 1>; -defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "u", int_arm_neon_vhaddu, 1>; +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; // VRHADD : Vector Rounding Halving Add -defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "s", int_arm_neon_vrhadds, 1>; -defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; // VQADD : Vector Saturating Add -defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "s", int_arm_neon_vqadds, 1>; -defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "u", int_arm_neon_vqaddu, 1>; +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", int_arm_neon_vaddhn, 1>; @@ -2070,10 +2041,10 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", // VMUL : Vector Multiply (integer, polynomial and floating-point) defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; -def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul", "p8", - v8i8, v8i8, int_arm_neon_vmulp, 1>; -def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul", "p8", - v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; 
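The other recurring change in this region is that the three-register NEON classes (N3VDInt, N3VQInt, and the N3VInt_HS/_QHS/_QHSD multiclasses built on them) gain an explicit `Format f` parameter just before the itinerary, and every instantiation now states its encoding format (N3RegFrm for plain three-register forms, NVMulSLFrm for the multiply-by-scalar-lane forms, and so on), presumably so later consumers such as the disassembler can key off the recorded format. A reduced sketch of threading such a tag through a multiclass, with invented names only:

class EncFormat<bits<2> val> { bits<2> Value = val; }
def ThreeRegFrm : EncFormat<0>;
def MulLaneFrm  : EncFormat<1>;

class NeonLikeInst<EncFormat f, string asm> {
  EncFormat F = f;       // recorded per instruction, e.g. for a decoder table
  string AsmString = asm;
}

multiclass IntOp_DQ<EncFormat f, string opc> {
  def D : NeonLikeInst<f, opc>;   // 64-bit register variant
  def Q : NeonLikeInst<f, opc>;   // 128-bit register variant
}

// Callers now spell out the format, mirroring the VMULp defs above:
defm VFOOi  : IntOp_DQ<ThreeRegFrm, "vfoo">;
defm VFOOsl : IntOp_DQ<MulLaneFrm, "vfoo">;

The hunks below apply the same mechanical change to VQDMULH, VHSUB/VQSUB, VABD, VMAX/VMIN, the pairwise operations, and the reciprocal-step instructions.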
def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32",
v2f32, v2f32, fmul, 1>;
def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32",
@@ -2103,7 +2074,7 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
-defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
"vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
@@ -2125,8 +2096,8 @@ def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;
// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
-defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
- IIC_VMULi16Q, IIC_VMULi32Q,
+defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
+ IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
"vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
@@ -2285,18 +2256,18 @@ defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl", "u",
defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>;
defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>;
// VHSUB : Vector Halving Subtract
-defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vhsub", "s", int_arm_neon_vhsubs, 0>;
-defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vhsub", "u", int_arm_neon_vhsubu, 0>;
// VQSUB : Vector Saturing Subtract
-defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vqsub", "s", int_arm_neon_vqsubs, 0>;
-defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vqsub", "u", int_arm_neon_vqsubu, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
@@ -2323,8 +2294,8 @@ defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D,
IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "s", NEONvcge, 0>;
defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D,
IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32",
- v2i32, v2f32, NEONvcge, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+ NEONvcge, 0>;
def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
NEONvcge, 0>;
// For disassembly only.
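The next hunk also replaces the generic vnot helpers used by VBIC, VORN, VMVN and VBSL with two explicit PatFrags, vnot8 and vnot16, which spell vector bitwise NOT as XOR with an all-ones vector of the matching width (there is no standalone vector NOT node in the selection DAG, so the all-ones immediate is bitconverted to whatever element type the operand uses). A standalone sketch of the same idiom with renamed fragments, assuming the stock SelectionDAG definitions (xor, bitconvert, immAllOnesV, the vector value types) are in scope via the usual target includes:

def not_v8i8  : PatFrag<(ops node:$in),
                        (xor node:$in, (bitconvert (v8i8  immAllOnesV)))>;
def not_v16i8 : PatFrag<(ops node:$in),
                        (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;

// A selection pattern can then use the fragment like any other node, e.g.
// (hypothetical instruction name):
// def : Pat<(v2i32 (not_v8i8 DPR:$src)), (SOME_D_REG_NOT DPR:$src)>;

Splitting the fragment by width lets the 64-bit and 128-bit patterns each pick the right all-ones constant, which is the point of having both vnot8 and vnot16 below.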
@@ -2351,21 +2322,27 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", "$dst, $src, #0">; // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge", "f32", - v2i32, v2f32, int_arm_neon_vacged, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge", "f32", - v4i32, v4f32, int_arm_neon_vacgeq, 0>; +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>; // VACGT : Vector Absolute Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt", "f32", - v2i32, v2f32, int_arm_neon_vacgtd, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt", "f32", - v4i32, v4f32, int_arm_neon_vacgtq, 0>; +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; // Vector Bitwise Operations. +def vnot8 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>; +def vnot16 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + + // VAND : Vector Bitwise AND def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>; @@ -2386,74 +2363,80 @@ def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", // VBIC : Vector Bitwise Bit Clear (AND NOT) def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vbic", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (and DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vbic", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (and DPR:$src1, + (vnot8 DPR:$src2))))]>; def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vbic", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (and QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vbic", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (and QPR:$src1, + (vnot16 QPR:$src2))))]>; // VORN : Vector Bitwise OR NOT def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vorn", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (or DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vorn", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (or DPR:$src1, + (vnot8 DPR:$src2))))]>; def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vorn", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (or QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vorn", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (or QPR:$src1, + (vnot16 QPR:$src2))))]>; // VMVN : Vector Bitwise NOT def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, - (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>; + (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + 
[(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>; def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, - (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>; -def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>; -def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>; + (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>; +def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>; // VBSL : Vector Bitwise Select def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, - (v2i32 (or (and DPR:$src2, DPR:$src1), - (and DPR:$src3, (vnot_conv DPR:$src1)))))]>; + (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, + (v2i32 (or (and DPR:$src2, DPR:$src1), + (and DPR:$src3, (vnot8 DPR:$src1)))))]>; def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, - (v4i32 (or (and QPR:$src2, QPR:$src1), - (and QPR:$src3, (vnot_conv QPR:$src1)))))]>; + (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (v4i32 (or (and QPR:$src2, QPR:$src1), + (and QPR:$src3, (vnot16 QPR:$src1)))))]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT : Vector Bitwise Insert if True // like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking @@ -2463,15 +2446,15 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, // Vector Absolute Differences. 
// VABD : Vector Absolute Difference
-defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vabd", "s", int_arm_neon_vabds, 0>;
-defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vabd", "u", int_arm_neon_vabdu, 0>;
-def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND,
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
"vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>;
-def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ,
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
"vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
@@ -2491,36 +2474,40 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>;
// Vector Maximum and Minimum.
// VMAX : Vector Maximum
-defm VMAXs : N3VInt_QHS<0,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmax", "s", int_arm_neon_vmaxs, 1>;
-defm VMAXu : N3VInt_QHS<1,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmax", "u", int_arm_neon_vmaxu, 1>;
-def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax", "f32",
- v2f32, v2f32, int_arm_neon_vmaxs, 1>;
-def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax", "f32",
- v4f32, v4f32, int_arm_neon_vmaxs, 1>;
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmax", "s", int_arm_neon_vmaxs, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmax", "u", int_arm_neon_vmaxu, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax",
+ "f32", v2f32, v2f32, int_arm_neon_vmaxs, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax",
+ "f32", v4f32, v4f32, int_arm_neon_vmaxs, 1>;
// VMIN : Vector Minimum
-defm VMINs : N3VInt_QHS<0,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmin", "s", int_arm_neon_vmins, 1>;
-defm VMINu : N3VInt_QHS<1,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmin", "u", int_arm_neon_vminu, 1>;
-def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin", "f32",
- v2f32, v2f32, int_arm_neon_vmins, 1>;
-def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin", "f32",
- v4f32, v4f32, int_arm_neon_vmins, 1>;
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmin", "s", int_arm_neon_vmins, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmin", "u", int_arm_neon_vminu, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin",
+ "f32", v2f32, v2f32, int_arm_neon_vmins, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin",
+ "f32", v4f32, v4f32, int_arm_neon_vmins, 1>;
// Vector Pairwise Operations.
// VPADD : Vector Pairwise Add -def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd", "i8", - v8i8, v8i8, int_arm_neon_vpadd, 0>; -def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd", "i16", - v4i16, v4i16, int_arm_neon_vpadd, 0>; -def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd", "i32", - v2i32, v2i32, int_arm_neon_vpadd, 0>; -def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd", "f32", - v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i8", v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i16", v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i32", v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VBIND, "vpadd", + "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -2535,36 +2522,36 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", int_arm_neon_vpadalu>; // VPMAX : Vector Pairwise Maximum -def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "s8", - v8i8, v8i8, int_arm_neon_vpmaxs, 0>; -def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "s16", - v4i16, v4i16, int_arm_neon_vpmaxs, 0>; -def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "s32", - v2i32, v2i32, int_arm_neon_vpmaxs, 0>; -def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "u8", - v8i8, v8i8, int_arm_neon_vpmaxu, 0>; -def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "u16", - v4i16, v4i16, int_arm_neon_vpmaxu, 0>; -def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "u32", - v2i32, v2i32, int_arm_neon_vpmaxu, 0>; -def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax", "f32", - v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; // VPMIN : Vector Pairwise Minimum -def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "s8", - v8i8, v8i8, int_arm_neon_vpmins, 0>; -def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "s16", - v4i16, v4i16, int_arm_neon_vpmins, 0>; -def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "s32", - v2i32, v2i32, int_arm_neon_vpmins, 0>; -def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "u8", - v8i8, v8i8, int_arm_neon_vpminu, 0>; -def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "u16", - v4i16, v4i16, int_arm_neon_vpminu, 0>; -def 
VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "u32", - v2i32, v2i32, int_arm_neon_vpminu, 0>; -def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin", "f32", - v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -2583,10 +2570,10 @@ def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, v4f32, v4f32, int_arm_neon_vrecpe>; // VRECPS : Vector Reciprocal Step -def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrecps", "f32", v2f32, v2f32, int_arm_neon_vrecps, 1>; -def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; @@ -2605,25 +2592,30 @@ def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, v4f32, v4f32, int_arm_neon_vrsqrte>; // VRSQRTS : Vector Reciprocal Square Root Step -def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrsqrts", "f32", v2f32, v2f32, int_arm_neon_vrsqrts, 1>; -def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; // Vector Shifts. 
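The estimate/step pairs above are meant to be used together: VRECPE (or VRSQRTE) supplies a coarse seed, and each VRECPS (or VRSQRTS) performs one Newton-Raphson refinement, computing 2 - a*b (respectively (3 - a*b)/2). A small usage sketch with the standard <arm_neon.h> intrinsics, shown only to illustrate how the paired instructions compose; it is not something this patch adds:

    #include <arm_neon.h>
    // Approximate 1.0/d with VRECPE plus two VRECPS Newton-Raphson steps:
    // x(n+1) = x(n) * (2 - d * x(n)).
    static float32x2_t reciprocal(float32x2_t d) {
      float32x2_t x = vrecpe_f32(d);        // coarse estimate of 1/d
      x = vmul_f32(x, vrecps_f32(d, x));    // first refinement
      x = vmul_f32(x, vrecps_f32(d, x));    // second refinement
      return x;
    }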
// VSHL : Vector Shift -defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "s", int_arm_neon_vshifts, 0>; -defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu, 0>; +defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts, 0>; +defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu, 0>; // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; +defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, + N2RegVShLFrm>; // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs>; -defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru>; +defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, + N2RegVShRFrm>; +defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, + N2RegVShRFrm>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -2649,28 +2641,37 @@ defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>; // VRSHL : Vector Rounding Shift -defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts,0>; -defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu,0>; +defm VRSHLs : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts, 0>; +defm VRSHLu : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", int_arm_neon_vrshiftu, 0>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>; -defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>; +defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, + N2RegVShRFrm>; +defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, + N2RegVShRFrm>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", NEONvrshrn>; // VQSHL : Vector Saturating Shift -defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts,0>; -defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu,0>; +defm VQSHLs : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts, 0>; +defm VQSHLu : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu, 0>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>; -defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>; +defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, + N2RegVShLFrm>; +defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, + 
N2RegVShLFrm>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D, "vqshlu","s",NEONvqshlsu>; +defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, + N2RegVShLFrm>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -2683,12 +2684,12 @@ defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", NEONvqshrnsu>; // VQRSHL : Vector Saturating Rounding Shift -defm VQRSHLs : N3VInt_QHSD<0,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "s", - int_arm_neon_vqrshifts, 0>; -defm VQRSHLu : N3VInt_QHSD<1,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "u", - int_arm_neon_vqrshiftu, 0>; +defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts, 0>; +defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", @@ -2708,9 +2709,9 @@ defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; // VSLI : Vector Shift Left and Insert -defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli>; +defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; // VSRI : Vector Shift Right and Insert -defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri>; +defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; // Vector Absolute and Saturating Absolute. @@ -2732,19 +2733,22 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. 
-def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; -def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>; +def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +def vneg8 : PatFrag<(ops node:$in), + (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>; +def vneg16 : PatFrag<(ops node:$in), + (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>; class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (vneg DPR:$src)))]>; + [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>; class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (vneg QPR:$src)))]>; + [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>; -// VNEG : Vector Negate +// VNEG : Vector Negate (integer) def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; @@ -2762,12 +2766,12 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, "vneg", "f32", "$dst, $src", "", [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; -def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>; -def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>; -def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>; -def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>; -def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>; -def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>; +def : Pat<(v8i8 (vneg8 DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vneg8 DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vneg8 DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vneg16 QPR:$src)), (VNEGs32q QPR:$src)>; // VQNEG : Vector Saturating Negate defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, @@ -2805,9 +2809,9 @@ def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, // VMOV : Vector Move (Register) def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; // VMOV : Vector Move (Immediate) @@ -3048,30 +3052,29 @@ def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), // VDUP : Vector Duplicate Lane (from scalar to all elements) -class VDUPLND<bits<2> op19_18, bits<2> op17_16, - string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0, - (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; +class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType Ty> + : NVDupLane<op19_16, 0, (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; -class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, string Dt, +class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType 
OpTy> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0, - (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>; + : NVDupLane<op19_16, 1, (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), + imm:$lane)))]>; // Inst{19-16} is partially specified depending on the element size. -def VDUPLN8d : VDUPLND<{?,?}, {?,1}, "vdup", "8", v8i8>; -def VDUPLN16d : VDUPLND<{?,?}, {1,0}, "vdup", "16", v4i16>; -def VDUPLN32d : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2i32>; -def VDUPLNfd : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2f32>; -def VDUPLN8q : VDUPLNQ<{?,?}, {?,1}, "vdup", "8", v16i8, v8i8>; -def VDUPLN16q : VDUPLNQ<{?,?}, {1,0}, "vdup", "16", v8i16, v4i16>; -def VDUPLN32q : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4i32, v2i32>; -def VDUPLNfq : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4f32, v2f32>; +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>; +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>; +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>; +def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>; +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>; +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>; +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>; +def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>; def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -3233,15 +3236,15 @@ def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; class VEXTd<string OpcodeStr, string Dt, ValueType Ty> : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst), - (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs), (Ty DPR:$rhs), imm:$index)))]>; class VEXTq<string OpcodeStr, string Dt, ValueType Ty> : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst), - (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs), (Ty QPR:$rhs), imm:$index)))]>; @@ -3290,25 +3293,26 @@ def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; // VTBL : Vector Table Lookup def VTBL1 : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$src), IIC_VTB1, + (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1, "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, 
(outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), + NVTBLFrm, IIC_VTB4, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; @@ -3317,26 +3321,27 @@ def VTBL4 // VTBX : Vector Table Extension def VTBX1 : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1, + (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1, "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1 DPR:$orig, DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), + NVTBLFrm, IIC_VTBX3, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, @@ -3396,12 +3401,12 @@ def : N3VSPat<fmul, VMULfd_sfp>; //let neverHasSideEffects = 1 in //def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", -// v2f32, fmul, fadd>; +// v2f32, fmul, fadd>; //def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; //let neverHasSideEffects = 1 in //def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", -// v2f32, fmul, fsub>; +// v2f32, fmul, fsub>; //def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; // Vector Absolute used for single-precision FP @@ -3421,14 +3426,14 @@ def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>; // Vector Maximum used for single-precision FP let neverHasSideEffects = 1 in def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmax", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat<NEONfmax, VMAXfd_sfp>; // Vector Minimum used for single-precision FP let neverHasSideEffects = 1 in def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmin", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat<NEONfmin, VMINfd_sfp>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index aca8230..0458389 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -545,7 +545,7 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, // FP FMA Operations. 
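Looking back at the VTBL/VTBX definitions above (which only gain the NVTBLFrm format argument here): VTBL treats one to four D registers as a byte table and uses each byte of the index operand to pick a table byte, producing zero for out-of-range indices, while VTBX leaves the corresponding destination byte unchanged. A one-line sketch with the standard intrinsic, for illustration only:

    #include <arm_neon.h>
    // VTBL1 via the vtbl1_u8 intrinsic: result[i] = table[idx[i]] if
    // idx[i] < 8, else 0. (vtbx1_u8 would keep the old destination byte.)
    static uint8x8_t byte_lookup(uint8x8_t table, uint8x8_t idx) {
      return vtbl1_u8(table, idx);
    }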
// -def VMLAD : ADbI<0b11100, 0b00, 0, 0, +def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), @@ -558,7 +558,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VNMLSD : ADbI<0b11100, 0b01, 0, 0, +def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), @@ -571,7 +571,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VMLSD : ADbI<0b11100, 0b00, 1, 0, +def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), @@ -589,7 +589,7 @@ def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; -def VNMLAD : ADbI<0b11100, 0b01, 1, 0, +def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index bdbec30..cb762a4 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -341,6 +341,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? UINT_MAX : ARMRegisterInfo::getRegisterNumbering(PReg); + unsigned Count = 1; for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { int NewOffset = MemOps[i].Offset; @@ -350,11 +351,14 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, : ARMRegisterInfo::getRegisterNumbering(Reg); // AM4 - register numbers in ascending order. // AM5 - consecutive register numbers in ascending order. + // Can only do up to 16 double-word registers per insn. if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) { + ((isAM4 && RegNum > PRegNum) + || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; + ++Count; } else { // Can't merge this in. Try merge the earlier ones first. 
MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 2dad7f1..9e55cd8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -22,10 +22,6 @@ using namespace llvm; static cl::opt<bool> ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); -static cl::opt<bool> -UseNEONFP("arm-use-neon-fp", - cl::desc("Use NEON for single-precision FP"), - cl::init(false), cl::Hidden); static cl::opt<bool> UseMOVT("arm-use-movt", @@ -35,7 +31,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isT) : ARMArchVersion(V4) , ARMFPUType(None) - , UseNEONForSinglePrecisionFP(UseNEONFP) + , UseNEONForSinglePrecisionFP(false) + , SlowVMLx(false) , IsThumb(isT) , ThumbMode(Thumb1) , PostRAScheduler(false) @@ -115,14 +112,6 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, if (!isThumb() || hasThumb2()) PostRAScheduler = true; - - // Set CPU specific features. - if (CPUString == "cortex-a8") { - // On Cortex-a8, it's faster to perform some single-precision FP - // operations with NEON instructions. - if (UseNEONFP.getPosition() == 0) - UseNEONForSinglePrecisionFP = true; - } } /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 2dc81a4..fa56a91 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -50,6 +50,10 @@ protected: /// determine if NEON should actually be used. bool UseNEONForSinglePrecisionFP; + /// SlowVMLx - If the VFP2 instructions are available, indicates whether + /// the VML[AS] instructions are slow (if so, don't use them). + bool SlowVMLx; + /// IsThumb - True if we are in thumb mode, false if in ARM mode. bool IsThumb; @@ -119,6 +123,7 @@ protected: bool hasNEON() const { return ARMFPUType >= NEON; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } + bool useVMLx() const {return hasVFP2() && !SlowVMLx; } bool hasFP16() const { return HasFP16; } diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index 4a7a1e4..ba736e3 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -841,7 +841,7 @@ GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << getFunctionNumber() << '_' << uid << '_' << uid2 << "_set_" << MBB->getNumber(); - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMAsmPrinter:: @@ -849,7 +849,7 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { SmallString<60> Name; raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid << '_' << uid2; - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) { @@ -1132,6 +1132,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. 
+ // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); @@ -1186,7 +1191,7 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { // FIXME: MOVE TO SHARED PLACE. unsigned Id = (unsigned)MI->getOperand(2).getImm(); const char *Prefix = MAI->getPrivateGlobalPrefix(); - MCSymbol *Label =OutContext.GetOrCreateTemporarySymbol(Twine(Prefix) + MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); OutStreamer.EmitLabel(Label); diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp index 7cb305f..ab2b06b 100644 --- a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp @@ -75,7 +75,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMMCInstLower:: @@ -91,7 +91,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand ARMMCInstLower:: diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index c36fe63..7334259 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -46,10 +46,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: case ARM::VLD2d8: case ARM::VLD2d16: case ARM::VLD2d32: - case ARM::VLD2d64: case ARM::VLD2LNd8: case ARM::VLD2LNd16: case ARM::VLD2LNd32: @@ -83,7 +86,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD3d8: case ARM::VLD3d16: case ARM::VLD3d32: - case ARM::VLD3d64: + case ARM::VLD1d64T: case ARM::VLD3LNd8: case ARM::VLD3LNd16: case ARM::VLD3LNd32: @@ -128,7 +131,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD4d8: case ARM::VLD4d16: case ARM::VLD4d32: - case ARM::VLD4d64: + case ARM::VLD1d64Q: case ARM::VLD4LNd8: case ARM::VLD4LNd16: case ARM::VLD4LNd32: @@ -170,10 +173,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: case ARM::VST2d8: case ARM::VST2d16: case ARM::VST2d32: - case ARM::VST2d64: case ARM::VST2LNd8: case ARM::VST2LNd16: case ARM::VST2LNd32: @@ -207,7 +213,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: - case ARM::VST3d64: + case ARM::VST1d64T: case ARM::VST3LNd8: case ARM::VST3LNd16: case ARM::VST3LNd32: @@ -252,7 +258,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST4d8: case ARM::VST4d16: case ARM::VST4d32: - case ARM::VST4d64: + case ARM::VST1d64Q: case ARM::VST4LNd8: case ARM::VST4LNd16: case ARM::VST4LNd32: diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 57b65cf..85d5ca0 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -12,6 +12,9 @@ Reimplement 'select' in terms of 
'SEL'. A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX +Interesting optimization for PIC codegen on arm-linux: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129 + //===---------------------------------------------------------------------===// Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 29ae631..ad98839 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -200,6 +200,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // It's illegal to emit pop instruction without operands. if (NumRegs) MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); return true; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index e4abcdb..55163f9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -69,7 +69,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = @@ -93,7 +93,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp index 39f0749..d539e08 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.cpp +++ b/lib/Target/Alpha/AlphaInstrInfo.cpp @@ -301,7 +301,15 @@ bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TB bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
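The same guard keeps reappearing in this patch (ARM and Alpha above, and SPU, Mips, MSP430 and PIC16 below): AnalyzeBranch and RemoveBranch must step over DBG_VALUE instructions so that building with debug info does not change which terminators the branch folder sees. The per-target copies differ only in what they return for an empty block, but the core loop could in principle be factored into a helper; a purely hypothetical sketch of that idea (this patch does not add such a function):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;
    // Return the last non-debug instruction in MBB, or MBB.end() if the
    // block is empty or contains only DBG_VALUEs.
    static MachineBasicBlock::iterator getLastNonDebugInstr(MachineBasicBlock &MBB) {
      MachineBasicBlock::iterator I = MBB.end();
      while (I != MBB.begin()) {
        --I;
        if (!I->isDebugValue())
          return I;
      }
      return MBB.end();
    }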
@@ -362,6 +370,11 @@ unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != Alpha::BR && I->getOpcode() != Alpha::COND_BRANCH_I && I->getOpcode() != Alpha::COND_BRANCH_F) diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.td b/lib/Target/Blackfin/BlackfinInstrInfo.td index e3c3993..2471688 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.td +++ b/lib/Target/Blackfin/BlackfinInstrInfo.td @@ -65,23 +65,23 @@ def HI16 : SDNodeXForm<imm, [{ //===----------------------------------------------------------------------===// def imm3 : PatLeaf<(imm), [{return isInt<3>(N->getSExtValue());}]>; -def uimm3 : PatLeaf<(imm), [{return isUint<3>(N->getZExtValue());}]>; -def uimm4 : PatLeaf<(imm), [{return isUint<4>(N->getZExtValue());}]>; -def uimm5 : PatLeaf<(imm), [{return isUint<5>(N->getZExtValue());}]>; +def uimm3 : PatLeaf<(imm), [{return isUInt<3>(N->getZExtValue());}]>; +def uimm4 : PatLeaf<(imm), [{return isUInt<4>(N->getZExtValue());}]>; +def uimm5 : PatLeaf<(imm), [{return isUInt<5>(N->getZExtValue());}]>; def uimm5m2 : PatLeaf<(imm), [{ uint64_t value = N->getZExtValue(); - return value % 2 == 0 && isUint<5>(value); + return value % 2 == 0 && isUInt<5>(value); }]>; def uimm6m4 : PatLeaf<(imm), [{ uint64_t value = N->getZExtValue(); - return value % 4 == 0 && isUint<6>(value); + return value % 4 == 0 && isUInt<6>(value); }]>; def imm7 : PatLeaf<(imm), [{return isInt<7>(N->getSExtValue());}]>; def imm16 : PatLeaf<(imm), [{return isInt<16>(N->getSExtValue());}]>; -def uimm16 : PatLeaf<(imm), [{return isUint<16>(N->getZExtValue());}]>; +def uimm16 : PatLeaf<(imm), [{return isUInt<16>(N->getZExtValue());}]>; def ximm16 : PatLeaf<(imm), [{ int64_t value = N->getSExtValue(); @@ -610,8 +610,7 @@ def MOVE_ncccc : F1<(outs NotCC:$cc), (ins JustCC:$sb), "cc = !cc;", []>; def MOVECC_zext : F1<(outs D:$dst), (ins JustCC:$cc), - "$dst = $cc;", - [/*(set D:$dst, (zext JustCC:$cc))*/]>; + "$dst = $cc;", []>; def MOVENCC_z : F1<(outs D:$dst), (ins NotCC:$cc), "$dst = cc;", []>; @@ -859,17 +858,5 @@ def : Pat<(BfinCall (i32 tglobaladdr:$dst)), (CALLa tglobaladdr:$dst)>; def : Pat<(BfinCall (i32 texternalsym:$dst)), (CALLa texternalsym:$dst)>; - -//def : Pat<(sext JustCC:$cc), -// (NEG (MOVECC_zext JustCC:$cc))>; -//def : Pat<(anyext JustCC:$cc), -// (MOVECC_zext JustCC:$cc)>; -def : Pat<(i16 (zext JustCC:$cc)), - (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>; -def : Pat<(i16 (sext JustCC:$cc)), - (EXTRACT_SUBREG (NEG (MOVECC_zext JustCC:$cc)), bfin_subreg_lo16)>; -def : Pat<(i16 (anyext JustCC:$cc)), - (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>; - def : Pat<(i16 (trunc D:$src)), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS D:$src, D)), bfin_subreg_lo16)>; diff --git a/lib/Target/Blackfin/BlackfinIntrinsics.td b/lib/Target/Blackfin/BlackfinIntrinsics.td index bf02cfe..ce21b08 100644 --- a/lib/Target/Blackfin/BlackfinIntrinsics.td +++ b/lib/Target/Blackfin/BlackfinIntrinsics.td @@ -21,14 +21,14 @@ let TargetPrefix = "bfin", isTarget = 1 in { // Execute csync instruction with workarounds def int_bfin_csync : GCCBuiltin<"__builtin_bfin_csync">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; // Execute ssync instruction with workarounds def int_bfin_ssync : GCCBuiltin<"__builtin_bfin_ssync">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; // Execute idle instruction with workarounds def int_bfin_idle 
: GCCBuiltin<"__builtin_bfin_idle">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; } diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp index b39a342..84dc9ca 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp @@ -164,7 +164,7 @@ void BlackfinRegisterInfo::loadConstant(MachineBasicBlock &MBB, return; } - if (isUint<16>(value)) { + if (isUInt<16>(value)) { BuildMI(MBB, I, DL, TII.get(BF::LOADuimm16), Reg).addImm(value); return; } @@ -255,13 +255,13 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(FIPos==1 && "Bad frame index operand"); MI.getOperand(FIPos).ChangeToRegister(BaseReg, false); MI.getOperand(FIPos+1).setImm(Offset); - if (isUint<6>(Offset)) { + if (isUInt<6>(Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32p_uimm6m4 : BF::LOAD32p_uimm6m4)); return 0; } - if (BaseReg == BF::FP && isUint<7>(-Offset)) { + if (BaseReg == BF::FP && isUInt<7>(-Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32fp_nimm7m4 : BF::LOAD32fp_nimm7m4)); diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h index c960974..1f21511 100644 --- a/lib/Target/CellSPU/SPU.h +++ b/lib/Target/CellSPU/SPU.h @@ -15,7 +15,6 @@ #ifndef LLVM_TARGET_IBMCELLSPU_H #define LLVM_TARGET_IBMCELLSPU_H -#include "llvm/System/DataTypes.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -25,73 +24,7 @@ namespace llvm { FunctionPass *createSPUISelDag(SPUTargetMachine &TM); - /*--== Utility functions/predicates/etc used all over the place: --==*/ - //! Predicate test for a signed 10-bit value - /*! - \param Value The input value to be tested - - This predicate tests for a signed 10-bit value, returning the 10-bit value - as a short if true. - */ - template<typename T> - inline bool isS10Constant(T Value); - - template<> - inline bool isS10Constant<short>(short Value) { - int SExtValue = ((int) Value << (32 - 10)) >> (32 - 10); - return ((Value > 0 && Value <= (1 << 9) - 1) - || (Value < 0 && (short) SExtValue == Value)); - } - - template<> - inline bool isS10Constant<int>(int Value) { - return (Value >= -(1 << 9) && Value <= (1 << 9) - 1); - } - - template<> - inline bool isS10Constant<uint32_t>(uint32_t Value) { - return (Value <= ((1 << 9) - 1)); - } - - template<> - inline bool isS10Constant<int64_t>(int64_t Value) { - return (Value >= -(1 << 9) && Value <= (1 << 9) - 1); - } - - template<> - inline bool isS10Constant<uint64_t>(uint64_t Value) { - return (Value <= ((1 << 9) - 1)); - } - - //! Predicate test for an unsigned 10-bit value - /*! - \param Value The input value to be tested - - This predicate tests for an unsigned 10-bit value, returning the 10-bit value - as a short if true. - */ - inline bool isU10Constant(short Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(int Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(uint32_t Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(int64_t Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(uint64_t Value) { - return (Value == (Value & 0x3ff)); - } - extern Target TheCellSPUTarget; - } // Defines symbolic names for the SPU instructions. 
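The SPU-specific isS10Constant/isU10Constant templates deleted above (like the isUint<> spellings fixed in the Blackfin files earlier) give way to the generic bit-width predicates from llvm/Support/MathExtras.h, used as isInt<10>(V) and isUInt<10>(V) in the hunks that follow. A stand-alone sketch of what those predicates test, with local stand-in names rather than the real declarations:

    #include <cstdint>
    // Signed N-bit range: -(2^(N-1)) .. 2^(N-1)-1.
    // For N == 10 that is -512 .. 511, exactly what isS10Constant checked.
    template <unsigned N> bool fitsSIntN(int64_t V) {
      return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
    }
    // Unsigned N-bit range: 0 .. 2^N-1.
    // For N == 10 that is 0 .. 1023, exactly what isU10Constant checked.
    template <unsigned N> bool fitsUIntN(uint64_t V) {
      return V < (UINT64_C(1) << N);
    }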
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 396a921..90f8310 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -44,28 +44,28 @@ namespace { bool isI64IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates bool isI32IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i32 unsigned 10-bit immediate values bool isI32IntU10Immediate(ConstantSDNode *CN) { - return isU10Constant(CN->getSExtValue()); + return isUInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values bool isI16IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! SDNode predicate for i16 sign-extended, 10-bit immediate values @@ -80,7 +80,7 @@ namespace { bool isI16IntU10Immediate(ConstantSDNode *CN) { - return isU10Constant((short) CN->getZExtValue()); + return isUInt<10>((short) CN->getZExtValue()); } //! SDNode predicate for i16 sign-extended, 10-bit immediate values diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index e863ee3..4b0d442 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1107,7 +1107,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset, true, false); SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); - SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); + SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, false, false, 0); Chain = Store.getOperand(0); @@ -1491,7 +1492,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG, return SDValue(); Value = Value >> 32; } - if (isS10Constant(Value)) + if (isInt<10>(Value)) return DAG.getTargetConstant(Value, ValueType); } diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 2306665..86825c8 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -450,7 +450,15 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
@@ -513,6 +521,11 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isCondBranch(I) && !isUncondBranch(I)) return 0; diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 5068f77..6d1f87d 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1268,7 +1268,12 @@ multiclass BitwiseAnd defm AND : BitwiseAnd; -// N.B.: vnot_conv is one of those special target selection pattern fragments, + +def vnot_cell_conv : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v4i32 immAllOnesV)))>; + +// N.B.: vnot_cell_conv is one of those special target selection pattern +// fragments, // in which we expect there to be a bit_convert on the constant. Bear in mind // that llvm translates "not <reg>" to "xor <reg>, -1" (or in this case, a // constant -1 vector.) @@ -1301,7 +1306,7 @@ multiclass AndComplement def r8: ANDCRegInst<R8C>; // Sometimes, the xor pattern has a bitcast constant: - def v16i8_conv: ANDCVecInst<v16i8, vnot_conv>; + def v16i8_conv: ANDCVecInst<v16i8, vnot_cell_conv>; } defm ANDC : AndComplement; @@ -1934,7 +1939,7 @@ multiclass SelectBits def v16i8: SELBVecInst<v16i8>; def v8i16: SELBVecInst<v8i16>; def v4i32: SELBVecInst<v4i32>; - def v2i64: SELBVecInst<v2i64, vnot_conv>; + def v2i64: SELBVecInst<v2i64, vnot_cell_conv>; def r128: SELBRegInst<GPRC>; def r64: SELBRegInst<R64C>; @@ -4373,7 +4378,7 @@ def : Pat<(v2f64 (bitconvert (v16i8 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>; -def : Pat<(v2f64 (bitconvert (v2f64 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))), (ORi128_vec VECREG:$src)>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 8c78bab..f3071f2 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineLocation.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -336,6 +337,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc dl = II->getDebugLoc(); while (!MI.getOperand(i).isFI()) { ++i; @@ -364,11 +366,22 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, // Replace the FrameIndex with base register with $sp (aka $r1) SPOp.ChangeToRegister(SPU::R1, false); - if (Offset > SPUFrameInfo::maxFrameOffset() - || Offset < SPUFrameInfo::minFrameOffset()) { - errs() << "Large stack adjustment (" - << Offset - << ") in SPURegisterInfo::eliminateFrameIndex."; + + // if 'Offset' doesn't fit to the D-form instruction's + // immediate, convert the instruction to X-form + // if the instruction is not an AI (which takes a s10 immediate), assume + // it is a load/store that can take a s14 immediate + if ((MI.getOpcode() == 
SPU::AIr32 && !isInt<10>(Offset)) + || !isInt<14>(Offset)) { + int newOpcode = convertDFormToXForm(MI.getOpcode()); + unsigned tmpReg = findScratchRegister(II, RS, &SPU::R32CRegClass, SPAdj); + BuildMI(MBB, II, dl, TII.get(SPU::ILr32), tmpReg ) + .addImm(Offset); + BuildMI(MBB, II, dl, TII.get(newOpcode), MI.getOperand(0).getReg()) + .addReg(tmpReg, RegState::Kill) + .addReg(SPU::R1); + // remove the replaced D-form instruction + MBB.erase(II); } else { MO.ChangeToImmediate(Offset); } @@ -423,6 +436,14 @@ void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MF.getRegInfo().setPhysRegUnused(SPU::R0); MF.getRegInfo().setPhysRegUnused(SPU::R1); MF.getRegInfo().setPhysRegUnused(SPU::R2); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterClass *RC = &SPU::R32CRegClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + } void SPURegisterInfo::emitPrologue(MachineFunction &MF) const @@ -448,7 +469,8 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); - if (FrameSize > 0 || MFI->hasCalls()) { + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->hasCalls()) { FrameSize = -(FrameSize + SPUFrameInfo::minStackSize()); if (hasDebugInfo) { // Mark effective beginning of when frame pointer becomes valid. @@ -460,14 +482,14 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const // for the ABI BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) .addReg(SPU::R1); - if (isS10Constant(FrameSize)) { + if (isInt<10>(FrameSize)) { // Spill $sp to adjusted $sp BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) .addReg(SPU::R1); // Adjust $sp by required amout BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) .addImm(FrameSize); - } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { + } else if (isInt<16>(FrameSize)) { // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use // $r2 to adjust $sp: BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) @@ -475,7 +497,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1) + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) .addReg(SPU::R2) .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) @@ -549,9 +571,11 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const "Can only insert epilog into returning blocks"); assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitEpilogue: FrameSize not aligned"); - if (FrameSize > 0 || MFI->hasCalls()) { + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->hasCalls()) { FrameSize = FrameSize + SPUFrameInfo::minStackSize(); - if (isS10Constant(FrameSize + LinkSlotOffset)) { + if (isInt<10>(FrameSize + LinkSlotOffset)) { // Reload $lr, adjust $sp by required amount // Note: We do this to slightly improve dual issue -- not by much, but it // is an opportunity for dual issue. 
@@ -574,7 +598,7 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const .addReg(SPU::R2); BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) .addImm(16) - .addReg(SPU::R2); + .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). addReg(SPU::R2) .addImm(16); @@ -618,4 +642,43 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } +int +SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const +{ + switch(dFormOpcode) + { + case SPU::AIr32: return SPU::Ar32; + case SPU::LQDr32: return SPU::LQXr32; + case SPU::LQDr128: return SPU::LQXr128; + case SPU::LQDv16i8: return SPU::LQXv16i8; + case SPU::LQDv4f32: return SPU::LQXv4f32; + case SPU::STQDr32: return SPU::STQXr32; + case SPU::STQDr128: return SPU::STQXr128; + case SPU::STQDv16i8: return SPU::STQXv16i8; + case SPU::STQDv4i32: return SPU::STQXv4i32; + case SPU::STQDv4f32: return SPU::STQXv4f32; + + default: assert( false && "Unhandled D to X-form conversion"); + } + // default will assert, but need to return something to keep the + // compiler happy. + return dFormOpcode; +} + +// TODO this is already copied from PPC. Could this convenience function +// be moved to the RegScavenger class? +unsigned +SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const +{ + assert(RS && "Register scavenging must be on"); + unsigned Reg = RS->FindUnusedReg(RC); + if (Reg == 0) + Reg = RS->scavengeRegister(RC, II, SPAdj); + assert( Reg && "Register scavenger failed"); + return Reg; +} + #include "SPUGenRegisterInfo.inc" diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 48feb5c..0a70318 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -53,6 +53,10 @@ namespace llvm { virtual const TargetRegisterClass* const * getCalleeSavedRegClasses(const MachineFunction *MF) const; + //! Allow for scavenging, so we can get scratch registers when needed. + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const + { return true; } + //! Return the reserved registers BitVector getReservedRegs(const MachineFunction &MF) const; @@ -97,6 +101,21 @@ namespace llvm { //! Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; + + //! Convert D-form load/store to X-form load/store + /*! + Converts a regiser displacement load/store into a register-indexed + load/store for large stack frames, when the stack frame exceeds the + range of a s10 displacement. + */ + int convertDFormToXForm(int dFormOpcode) const; + + //! Acquire an unused register in an emergency. 
+ unsigned findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const; + }; } // end namespace llvm diff --git a/lib/Target/MBlaze/MBlazeIntrinsics.td b/lib/Target/MBlaze/MBlazeIntrinsics.td index 76eb563..82552fa 100644 --- a/lib/Target/MBlaze/MBlazeIntrinsics.td +++ b/lib/Target/MBlaze/MBlazeIntrinsics.td @@ -21,11 +21,11 @@ let TargetPrefix = "mblaze", isTarget = 1 in { [llvm_i32_ty], [IntrWriteMem]>; - class MBFSL_Put_Intrinsic : Intrinsic<[llvm_void_ty], + class MBFSL_Put_Intrinsic : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; - class MBFSL_PutT_Intrinsic : Intrinsic<[llvm_void_ty], + class MBFSL_PutT_Intrinsic : Intrinsic<[], [llvm_i32_ty], [IntrWriteMem]>; } diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp index ac41cc8..15d16ec 100644 --- a/lib/Target/MSIL/MSILWriter.cpp +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -424,7 +424,7 @@ void MSILWriter::printPtrLoad(uint64_t N) { case Module::Pointer32: printSimpleInstruction("ldc.i4",utostr(N).c_str()); // FIXME: Need overflow test? - if (!isUInt32(N)) { + if (!isUInt<32>(N)) { errs() << "Value = " << utostr(N) << '\n'; llvm_unreachable("32-bit pointer overflowed"); } diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp index 32c6b04..f4d7d8a 100644 --- a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp +++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp @@ -59,7 +59,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { } // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *MSP430MCInstLower:: @@ -75,7 +75,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { } // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand MSP430MCInstLower:: diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 6372482..e584770 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -173,6 +173,8 @@ unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != MSP430::JMP && I->getOpcode() != MSP430::JCC) break; @@ -241,6 +243,9 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; + // Working from the bottom, when we see a non-terminator // instruction, we're done. 
if (!isUnpredicatedTerminator(I)) diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp index fb93706..1d5c511 100644 --- a/lib/Target/Mangler.cpp +++ b/lib/Target/Mangler.cpp @@ -235,10 +235,7 @@ std::string Mangler::getNameWithPrefix(const GlobalValue *GV, MCSymbol *Mangler::getSymbol(const GlobalValue *GV) { SmallString<60> NameStr; getNameWithPrefix(NameStr, GV, false); - if (!GV->hasPrivateLinkage()) - return Context.GetOrCreateSymbol(NameStr.str()); - - return Context.GetOrCreateTemporarySymbol(NameStr.str()); + return Context.GetOrCreateSymbol(NameStr.str()); } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index fa4518d..e948917 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -26,8 +26,9 @@ // Floating Point Compare and Branch def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<0>, - SDTCisInt<2>]>; +def SDT_MipsFPCmp : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisFP<1>, + SDTCisInt<3>]>; def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; @@ -244,12 +245,13 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare let hasDelaySlot = 1, Defs=[FCR31] in { def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc), - "c.$cc.s $fs, $ft", [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc), - (implicit FCR31)]>; + "c.$cc.s $fs, $ft", + [(set FCR31, (MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc))]>; def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc), - "c.$cc.d $fs, $ft", [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc), - (implicit FCR31)]>, Requires<[In32BitMode]>; + "c.$cc.d $fs, $ft", + [(set FCR31, (MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc))]>, + Requires<[In32BitMode]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 1a9bffc..85cf064 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -433,7 +433,15 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -562,6 +570,11 @@ RemoveBranch(MachineBasicBlock &MBB) const MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != Mips::J && GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) return 0; diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp index 2fb405e..da16e83 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.cpp +++ b/lib/Target/PIC16/PIC16InstrInfo.cpp @@ -226,6 +226,11 @@ bool PIC16InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // Get the terminator instruction. --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return true; + --I; + } // Handle unconditional branches. If the unconditional branch's target is // successor basic block then remove the unconditional branch. 
if (I->getOpcode() == PIC16::br_uncond && AllowModify) { diff --git a/lib/Target/PIC16/PIC16Section.cpp b/lib/Target/PIC16/PIC16Section.cpp index a96ebb8..2505b11 100644 --- a/lib/Target/PIC16/PIC16Section.cpp +++ b/lib/Target/PIC16/PIC16Section.cpp @@ -17,10 +17,9 @@ using namespace llvm; // This is the only way to create a PIC16Section. Sections created here // do not need to be explicitly deleted as they are managed by auto_ptrs. -PIC16Section *PIC16Section::Create(const StringRef &Name, - PIC16SectionType Ty, - const std::string &Address, - int Color, MCContext &Ctx) { +PIC16Section *PIC16Section::Create(StringRef Name, PIC16SectionType Ty, + StringRef Address, int Color, + MCContext &Ctx) { /// Determine the internal SectionKind info. /// Users of PIC16Section class should not need to know the internal @@ -59,8 +58,17 @@ PIC16Section *PIC16Section::Create(const StringRef &Name, } + // Copy strings into context allocated memory so they get free'd when the + // context is destroyed. + char *NameCopy = static_cast<char*>(Ctx.Allocate(Name.size(), 1)); + memcpy(NameCopy, Name.data(), Name.size()); + char *AddressCopy = static_cast<char*>(Ctx.Allocate(Address.size(), 1)); + memcpy(AddressCopy, Address.data(), Address.size()); + // Create the Section. - PIC16Section *S = new (Ctx) PIC16Section(Name, K, Address, Color); + PIC16Section *S = + new (Ctx) PIC16Section(StringRef(NameCopy, Name.size()), K, + StringRef(AddressCopy, Address.size()), Color); S->T = Ty; return S; } diff --git a/lib/Target/PIC16/PIC16Section.h b/lib/Target/PIC16/PIC16Section.h index 566f920..9039ca7 100644 --- a/lib/Target/PIC16/PIC16Section.h +++ b/lib/Target/PIC16/PIC16Section.h @@ -30,11 +30,11 @@ namespace llvm { PIC16SectionType T; /// Name of the section to uniquely identify it. - std::string Name; + StringRef Name; /// User can specify an address at which a section should be placed. /// Negative value here means user hasn't specified any. - std::string Address; + StringRef Address; /// Overlay information - Sections with same color can be overlaid on /// one another. @@ -43,17 +43,16 @@ namespace llvm { /// Total size of all data objects contained here. unsigned Size; - PIC16Section(const StringRef &name, SectionKind K, const std::string &addr, - int color) + PIC16Section(StringRef name, SectionKind K, StringRef addr, int color) : MCSection(K), Name(name), Address(addr), Color(color), Size(0) { } public: /// Return the name of the section. - const std::string &getName() const { return Name; } + StringRef getName() const { return Name; } /// Return the Address of the section. - const std::string &getAddress() const { return Address; } + StringRef getAddress() const { return Address; } /// Return the Color of the section. int getColor() const { return Color; } @@ -64,6 +63,8 @@ namespace llvm { void setSize(unsigned size) { Size = size; } /// Conatined data objects. + // FIXME: This vector is leaked because sections are allocated with a + // BumpPtrAllocator. std::vector<const GlobalVariable *>Items; /// Check section type. @@ -77,8 +78,8 @@ namespace llvm { PIC16SectionType getType() const { return T; } /// This would be the only way to create a section. 
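// The PIC16Section hunks above replace the owned std::string members with
// StringRef, so Create() must copy the incoming characters into storage owned
// by the MCContext; otherwise the stored StringRefs could dangle once the
// caller's temporaries die. A minimal sketch of that ownership pattern,
// reusing the calls shown in the hunk (the helper name is illustrative only,
// not part of the patch):

StringRef copyIntoContext(MCContext &Ctx, StringRef Str) {
  // Ctx.Allocate() hands out memory that lives until the context is torn
  // down, matching the lifetime of the MCSection that keeps the reference.
  char *Buf = static_cast<char*>(Ctx.Allocate(Str.size(), 1));
  memcpy(Buf, Str.data(), Str.size());
  return StringRef(Buf, Str.size());
}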
- static PIC16Section *Create(const StringRef &Name, PIC16SectionType Ty, - const std::string &Address, int Color, + static PIC16Section *Create(StringRef Name, PIC16SectionType Ty, + StringRef Address, int Color, MCContext &Ctx); /// Override this as PIC16 has its own way of printing switching diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp index ed6fc9d..5adefd3 100644 --- a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp @@ -309,8 +309,8 @@ namespace { const MCSymbol *&TOCEntry = TOC[Sym]; if (TOCEntry == 0) TOCEntry = OutContext. - GetOrCreateTemporarySymbol(StringRef(MAI->getPrivateGlobalPrefix()) + - "C" + Twine(LabelID++)); + GetOrCreateSymbol(StringRef(MAI->getPrivateGlobalPrefix()) + + "C" + Twine(LabelID++)); O << *TOCEntry << "@toc"; } @@ -674,14 +674,14 @@ static const MCSymbol *GetLazyPtr(const MCSymbol *Sym, MCContext &Ctx) { // Remove $stub suffix, add $lazy_ptr. SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()-5); TmpStr += "$lazy_ptr"; - return Ctx.GetOrCreateTemporarySymbol(TmpStr.str()); + return Ctx.GetOrCreateSymbol(TmpStr.str()); } static const MCSymbol *GetAnonSym(const MCSymbol *Sym, MCContext &Ctx) { // Add $tmp suffix to $stub, yielding $stub$tmp. SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()); TmpStr += "$tmp"; - return Ctx.GetOrCreateTemporarySymbol(TmpStr.str()); + return Ctx.GetOrCreateSymbol(TmpStr.str()); } void PPCDarwinAsmPrinter:: @@ -811,6 +811,11 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index a752421..52948c8 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -130,7 +130,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { } // If this branch is in range, ignore it. - if (isInt16(BranchSize)) { + if (isInt<16>(BranchSize)) { MBBStartOffset += 4; continue; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9d79c0d..4f88d35d 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -470,11 +470,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, if (CC == ISD::SETEQ || CC == ISD::SETNE) { if (isInt32Immediate(RHS, Imm)) { // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt16(Imm)) + if (isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); // If this is a 16-bit signed immediate, fold it. 
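// Throughout this patch the old fixed-width helpers (isInt16, isUInt16,
// isInt32, ...) give way to the templated isInt<N>() / isUInt<N>() forms from
// llvm/Support/MathExtras.h. The range test is unchanged; a small illustration
// with a made-up immediate (not part of the patch):

int64_t Imm = 0x5678;
if (isUInt<16>(Imm)) {
  // 0 <= Imm <= 0xFFFF: the unsigned 16-bit immediate form (e.g. CMPLWI) fits.
}
if (isInt<16>(Imm)) {
  // -32768 <= Imm <= 32767: the signed 16-bit form (e.g. CMPWI) fits.
}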
- if (isInt16((int)Imm)) + if (isInt<16>((int)Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); @@ -494,7 +494,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, } Opc = PPC::CMPLW; } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt32Immediate(RHS, Imm) && isUInt16(Imm)) + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); Opc = PPC::CMPLW; @@ -511,11 +511,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, if (CC == ISD::SETEQ || CC == ISD::SETNE) { if (isInt64Immediate(RHS.getNode(), Imm)) { // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt16(Imm)) + if (isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, getI32Imm(Imm & 0xFFFF)), 0); // If this is a 16-bit signed immediate, fold it. - if (isInt16(Imm)) + if (isInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, getI32Imm(Imm & 0xFFFF)), 0); @@ -528,7 +528,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, // xoris r0,r3,0x1234 // cmpldi cr0,r0,0x5678 // beq cr0,L6 - if (isUInt32(Imm)) { + if (isUInt<32>(Imm)) { SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, getI64Imm(Imm >> 16)), 0); return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, @@ -537,7 +537,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, } Opc = PPC::CMPLD; } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt64Immediate(RHS.getNode(), Imm) && isUInt16(Imm)) + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, getI64Imm(Imm & 0xFFFF)), 0); Opc = PPC::CMPLD; @@ -761,12 +761,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { unsigned Shift = 0; // If it can't be represented as a 32 bit value. - if (!isInt32(Imm)) { + if (!isInt<32>(Imm)) { Shift = CountTrailingZeros_64(Imm); int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; // If the shifted value fits 32 bits. - if (isInt32(ImmSh)) { + if (isInt<32>(ImmSh)) { // Go with the shifted value. Imm = ImmSh; } else { @@ -785,7 +785,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { unsigned Hi = (Imm >> 16) & 0xFFFF; // Simple value. - if (isInt16(Imm)) { + if (isInt<16>(Imm)) { // Just the Lo bits. Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); } else if (Lo) { diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 2c072c1..e67666d 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -5539,8 +5539,16 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { return false; } -EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +/// getOptimalMemOpType - Returns the target specific optimal type for load +/// and store operations as a result of memset, memcpy, and memmove lowering. +/// If DstAlign is zero that means it's safe to destination alignment can +/// satisfy any constraint. Similarly if SrcAlign is zero it means there +/// isn't a need to check it against alignment requirement, probably because +/// the source does not need to be loaded. It returns EVT::Other if +/// SelectionDAG should be responsible for determining it. 
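// The comment block above spells out the convention for the rewritten
// getOptimalMemOpType() hook: DstAlign == 0 means the destination can be
// assumed to satisfy any alignment constraint, and SrcAlign == 0 means the
// source's alignment never needs checking, typically because nothing is
// loaded from it (a memset, or a constant materialized directly). One
// hypothetical query against the PPC body that follows (illustrative only):
//
//   Lowering memset(p, 0, 64) on a 64-bit PowerPC subtarget:
//     getOptimalMemOpType(/*Size=*/64, /*DstAlign=*/0, /*SrcAlign=*/0,
//                         /*SafeToUseFP=*/true, DAG)
//   returns MVT::i64, so any inline expansion uses 8-byte stores as its unit.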
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { if (this->PPCSubTarget.isPPC64()) { return MVT::i64; diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 9c390ac..19fefab 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -347,9 +347,16 @@ namespace llvm { virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const; + /// getOptimalMemOpType - Returns the target specific optimal type for load + /// and store operations as a result of memset, memcpy, and memmove lowering. + /// If DstAlign is zero that means it's safe to destination alignment can + /// satisfy any constraint. Similarly if SrcAlign is zero it means there + /// isn't a need to check it against alignment requirement, probably because + /// the source does not need to be loaded. It returns EVT::Other if + /// SelectionDAG should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const; /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 3ff8f27..256370f 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -15,6 +15,10 @@ // Altivec transformation functions and pattern fragments. // +// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be +// of that type. 
+def vnot_ppc : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ @@ -321,7 +325,8 @@ def VAND : VXForm_1<1028, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>; def VANDC : VXForm_1<1092, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vandc $vD, $vA, $vB", VecFP, - [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>; + [(set VRRC:$vD, (and (v4i32 VRRC:$vA), + (vnot_ppc VRRC:$vB)))]>; def VCFSX : VXForm_1<842, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB), "vcfsx $vD, $vB, $UIMM", VecFP, @@ -435,7 +440,8 @@ def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>; def VNOR : VXForm_1<1284, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vnor $vD, $vA, $vB", VecFP, - [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>; + [(set VRRC:$vD, (vnot_ppc (or (v4i32 VRRC:$vA), + VRRC:$vB)))]>; def VOR : VXForm_1<1156, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vor $vD, $vA, $vB", VecFP, [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>; @@ -640,12 +646,11 @@ def:Pat<(vmrghw_unary_shuffle (v16i8 VRRC:$vA), undef), (VMRGHW VRRC:$vA, VRRC:$vA)>; // Logical Operations -def : Pat<(v4i32 (vnot VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; -def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; +def : Pat<(v4i32 (vnot_ppc VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; -def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))), +def : Pat<(v4i32 (vnot_ppc (or VRRC:$A, VRRC:$B))), (VNOR VRRC:$A, VRRC:$B)>; -def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))), +def : Pat<(v4i32 (and VRRC:$A, (vnot_ppc VRRC:$B))), (VANDC VRRC:$A, VRRC:$B)>; def : Pat<(fmul VRRC:$vA, VRRC:$vB), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 9895bea..82c637e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -213,7 +213,15 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -281,6 +289,11 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC) return 0; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6e7880e..44c5fe6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -512,7 +512,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr *MI = I; DebugLoc dl = MI->getDebugLoc(); - if (isInt16(CalleeAmt)) { + if (isInt<16>(CalleeAmt)) { BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg).addReg(StackReg). 
addImm(CalleeAmt); } else { @@ -596,7 +596,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, else Reg = PPC::R0; - if (MaxAlign < TargetAlign && isInt16(FrameSize)) { + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) .addReg(PPC::R31) .addImm(FrameSize); @@ -798,7 +798,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // clear can be encoded. This is extremely uncommon, because normally you // only "std" to a stack slot that is at least 4-byte aligned, but it can // happen in invalid code. - if (isInt16(Offset) && (!isIXAddr || (Offset & 3) == 0)) { + if (isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) { if (isIXAddr) Offset >>= 2; // The actual encoded value has the low two bits zero. MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1375,8 +1375,9 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { if (!isPPC64) { // PPC32. if (ALIGN_STACK && MaxAlign > TargetAlign) { - assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); - assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!"); BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0) .addReg(PPC::R1) @@ -1390,7 +1391,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { .addReg(PPC::R1) .addReg(PPC::R1) .addReg(PPC::R0); - } else if (isInt16(NegFrameSize)) { + } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1) .addReg(PPC::R1) .addImm(NegFrameSize) @@ -1408,8 +1409,9 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { } } else { // PPC64. if (ALIGN_STACK && MaxAlign > TargetAlign) { - assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); - assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!"); BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0) .addReg(PPC::X1) @@ -1422,7 +1424,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { .addReg(PPC::X1) .addReg(PPC::X1) .addReg(PPC::X0); - } else if (isInt16(NegFrameSize)) { + } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1) .addReg(PPC::X1) .addImm(NegFrameSize / 4) @@ -1591,7 +1593,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, // enabled (=> hasFastCall()==true) the fastcc call might contain a tail // call which invalidates the stack pointer value in SP(0). So we use the // value of R31 in this case. 
- if (FI->hasFastCall() && isInt16(FrameSize)) { + if (FI->hasFastCall() && isInt<16>(FrameSize)) { assert(hasFP(MF) && "Expecting a valid the frame pointer."); BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1) .addReg(PPC::R31).addImm(FrameSize); @@ -1605,7 +1607,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addReg(PPC::R1) .addReg(PPC::R31) .addReg(PPC::R0); - } else if (isInt16(FrameSize) && + } else if (isInt<16>(FrameSize) && (!ALIGN_STACK || TargetAlign >= MaxAlign) && !MFI->hasVarSizedObjects()) { BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1) @@ -1615,7 +1617,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addImm(0).addReg(PPC::R1); } } else { - if (FI->hasFastCall() && isInt16(FrameSize)) { + if (FI->hasFastCall() && isInt<16>(FrameSize)) { assert(hasFP(MF) && "Expecting a valid the frame pointer."); BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1) .addReg(PPC::X31).addImm(FrameSize); @@ -1629,7 +1631,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addReg(PPC::X1) .addReg(PPC::X31) .addReg(PPC::X0); - } else if (isInt16(FrameSize) && TargetAlign >= MaxAlign && + } else if (isInt<16>(FrameSize) && TargetAlign >= MaxAlign && !MFI->hasVarSizedObjects()) { BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1) .addReg(PPC::X1).addImm(FrameSize); @@ -1678,7 +1680,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS; unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI; - if (CallerAllocatedAmt && isInt16(CallerAllocatedAmt)) { + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg) .addReg(StackReg).addImm(CallerAllocatedAmt); } else { diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index f46840c..8c5e905 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -316,19 +316,19 @@ def FBCONVF64 : Pseudo<(outs FP64:$dst), (ins GR64:$src), let Defs = [PSW] in { def FCMP32rr : Pseudo<(outs), (ins FP32:$src1, FP32:$src2), "cebr\t$src1, $src2", - [(SystemZcmp FP32:$src1, FP32:$src2), (implicit PSW)]>; + [(set PSW, (SystemZcmp FP32:$src1, FP32:$src2))]>; def FCMP64rr : Pseudo<(outs), (ins FP64:$src1, FP64:$src2), "cdbr\t$src1, $src2", - [(SystemZcmp FP64:$src1, FP64:$src2), (implicit PSW)]>; + [(set PSW, (SystemZcmp FP64:$src1, FP64:$src2))]>; def FCMP32rm : Pseudo<(outs), (ins FP32:$src1, rriaddr12:$src2), "ceb\t$src1, $src2", - [(SystemZcmp FP32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp FP32:$src1, + (load rriaddr12:$src2)))]>; def FCMP64rm : Pseudo<(outs), (ins FP64:$src1, rriaddr12:$src2), "cdb\t$src1, $src2", - [(SystemZcmp FP64:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp FP64:$src1, + (load rriaddr12:$src2)))]>; } // Defs = [PSW] //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5fa7e8c..06f01e7 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -424,6 +424,8 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; // Working from the bottom, when we see a non-terminator // instruction, we're done. 
if (!isUnpredicatedTerminator(I)) @@ -500,6 +502,8 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != SystemZ::JMP && getCondFromBranchOpc(I->getOpcode()) == SystemZCC::INVALID) break; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 0d1af23..22bde4e 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -31,7 +31,8 @@ class SDTCisI64<int OpNum> : SDTCisVT<OpNum, i64>; def SDT_SystemZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; def SDT_SystemZCallSeqStart : SDCallSeqStart<[SDTCisI64<0>]>; def SDT_SystemZCallSeqEnd : SDCallSeqEnd<[SDTCisI64<0>, SDTCisI64<1>]>; -def SDT_CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; +def SDT_CmpTest : SDTypeProfile<1, 2, [SDTCisI64<0>, + SDTCisSameAs<1, 2>]>; def SDT_BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisI8<1>, SDTCisVT<2, i64>]>; @@ -980,100 +981,89 @@ let Defs = [PSW] in { def CMP32rr : RRI<0x19, (outs), (ins GR32:$src1, GR32:$src2), "cr\t$src1, $src2", - [(SystemZcmp GR32:$src1, GR32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, GR32:$src2))]>; def CMP64rr : RREI<0xB920, (outs), (ins GR64:$src1, GR64:$src2), "cgr\t$src1, $src2", - [(SystemZcmp GR64:$src1, GR64:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, GR64:$src2))]>; def CMP32ri : RILI<0xC2D, (outs), (ins GR32:$src1, s32imm:$src2), "cfi\t$src1, $src2", - [(SystemZcmp GR32:$src1, imm:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, imm:$src2))]>; def CMP64ri32 : RILI<0xC2C, (outs), (ins GR64:$src1, s32imm64:$src2), "cgfi\t$src1, $src2", - [(SystemZcmp GR64:$src1, i64immSExt32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, i64immSExt32:$src2))]>; def CMP32rm : RXI<0x59, (outs), (ins GR32:$src1, rriaddr12:$src2), "c\t$src1, $src2", - [(SystemZcmp GR32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, (load rriaddr12:$src2)))]>; def CMP32rmy : RXYI<0xE359, (outs), (ins GR32:$src1, rriaddr:$src2), "cy\t$src1, $src2", - [(SystemZcmp GR32:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, (load rriaddr:$src2)))]>; def CMP64rm : RXYI<0xE320, (outs), (ins GR64:$src1, rriaddr:$src2), "cg\t$src1, $src2", - [(SystemZcmp GR64:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, (load rriaddr:$src2)))]>; def UCMP32rr : RRI<0x15, (outs), (ins GR32:$src1, GR32:$src2), "clr\t$src1, $src2", - [(SystemZucmp GR32:$src1, GR32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, GR32:$src2))]>; def UCMP64rr : RREI<0xB921, (outs), (ins GR64:$src1, GR64:$src2), "clgr\t$src1, $src2", - [(SystemZucmp GR64:$src1, GR64:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, GR64:$src2))]>; def UCMP32ri : RILI<0xC2F, (outs), (ins GR32:$src1, i32imm:$src2), "clfi\t$src1, $src2", - [(SystemZucmp GR32:$src1, imm:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, imm:$src2))]>; def UCMP64ri32 : RILI<0xC2E, (outs), (ins GR64:$src1, i64i32imm:$src2), "clgfi\t$src1, $src2", - [(SystemZucmp GR64:$src1, i64immZExt32:$src2), - (implicit PSW)]>; + [(set PSW,(SystemZucmp GR64:$src1, i64immZExt32:$src2))]>; def UCMP32rm : RXI<0x55, (outs), (ins GR32:$src1, rriaddr12:$src2), "cl\t$src1, $src2", - [(SystemZucmp GR32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp 
GR32:$src1, + (load rriaddr12:$src2)))]>; def UCMP32rmy : RXYI<0xE355, (outs), (ins GR32:$src1, rriaddr:$src2), "cly\t$src1, $src2", - [(SystemZucmp GR32:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, + (load rriaddr:$src2)))]>; def UCMP64rm : RXYI<0xE351, (outs), (ins GR64:$src1, rriaddr:$src2), "clg\t$src1, $src2", - [(SystemZucmp GR64:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (load rriaddr:$src2)))]>; def CMPSX64rr32 : RREI<0xB930, (outs), (ins GR64:$src1, GR32:$src2), "cgfr\t$src1, $src2", - [(SystemZucmp GR64:$src1, (sext GR32:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (sext GR32:$src2)))]>; def UCMPZX64rr32 : RREI<0xB931, (outs), (ins GR64:$src1, GR32:$src2), "clgfr\t$src1, $src2", - [(SystemZucmp GR64:$src1, (zext GR32:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (zext GR32:$src2)))]>; def CMPSX64rm32 : RXYI<0xE330, (outs), (ins GR64:$src1, rriaddr:$src2), "cgf\t$src1, $src2", - [(SystemZucmp GR64:$src1, (sextloadi64i32 rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (sextloadi64i32 rriaddr:$src2)))]>; def UCMPZX64rm32 : RXYI<0xE331, (outs), (ins GR64:$src1, rriaddr:$src2), "clgf\t$src1, $src2", - [(SystemZucmp GR64:$src1, (zextloadi64i32 rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (zextloadi64i32 rriaddr:$src2)))]>; // FIXME: Add other crazy ucmp forms diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp index c3dcf8e..66bb914 100644 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp @@ -524,6 +524,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs. However, sometimes the types are local to the file. So + // we need to fill in the value for the NLP in those cases. 
OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp index 7d29d97..c851ca3 100644 --- a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp +++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp @@ -83,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { case X86II::MO_DARWIN_NONLAZY: case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { Name += "$non_lazy_ptr"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getGVStubEntry(Sym); @@ -98,7 +98,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { Name += "$non_lazy_ptr"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getHiddenGVStubEntry(Sym); if (StubSym.getPointer() == 0) { @@ -112,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } case X86II::MO_DARWIN_STUB: { Name += "$stub"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getFnStubEntry(Sym); if (StubSym.getPointer()) @@ -127,7 +127,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { Name.erase(Name.end()-5, Name.end()); StubSym = MachineModuleInfoImpl:: - StubValueTy(Ctx.GetOrCreateTemporarySymbol(Name.str()), false); + StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false); } return Sym; } @@ -287,7 +287,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break; case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SET0: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; + case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::MOV16r0: diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 4d3dedf..22285f1 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(X86GenCallingConv.inc -gen-callingconv) tablegen(X86GenSubtarget.inc -gen-subtarget) set(sources + SSEDomainFix.cpp X86AsmBackend.cpp X86CodeEmitter.cpp X86COFFMachineModuleInfo.cpp diff --git a/lib/Target/X86/SSEDomainFix.cpp b/lib/Target/X86/SSEDomainFix.cpp new file mode 100644 index 0000000..395ab57 --- /dev/null +++ b/lib/Target/X86/SSEDomainFix.cpp @@ -0,0 +1,499 @@ +//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the SSEDomainFix pass. +// +// Some SSE instructions like mov, and, or, xor are available in different +// variants for different operand types. 
These variant instructions are +// equivalent, but on Nehalem and newer cpus there is extra latency +// transferring data between integer and floating point domains. +// +// This pass changes the variant instructions to minimize domain crossings. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sse-domain-fix" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { + +/// Allocate objects from a pool, allow objects to be recycled, and provide a +/// way of deleting everything. +template<typename T, unsigned PageSize = 64> +class PoolAllocator { + std::vector<T*> Pages, Avail; +public: + ~PoolAllocator() { Clear(); } + + T* Alloc() { + if (Avail.empty()) { + T *p = new T[PageSize]; + Pages.push_back(p); + Avail.reserve(PageSize); + for (unsigned n = 0; n != PageSize; ++n) + Avail.push_back(p+n); + } + T *p = Avail.back(); + Avail.pop_back(); + return p; + } + + // Allow object to be reallocated. It won't be reconstructed. + void Recycle(T *p) { + p->clear(); + Avail.push_back(p); + } + + // Destroy all objects, make sure there are no external pointers to them. + void Clear() { + Avail.clear(); + while (!Pages.empty()) { + delete[] Pages.back(); + Pages.pop_back(); + } + } +}; + +/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track +/// of execution domains. +/// +/// An open DomainValue represents a set of instructions that can still switch +/// execution domain. Multiple registers may refer to the same open +/// DomainValue - they will eventually be collapsed to the same execution +/// domain. +/// +/// A collapsed DomainValue represents a single register that has been forced +/// into one of more execution domains. There is a separate collapsed +/// DomainValue for each register, but it may contain multiple execution +/// domains. A register value is initially created in a single execution +/// domain, but if we were forced to pay the penalty of a domain crossing, we +/// keep track of the fact the the register is now available in multiple +/// domains. +struct DomainValue { + // Basic reference counting. + unsigned Refs; + + // Available domains. For an open DomainValue, it is the still possible + // domains for collapsing. For a collapsed DomainValue it is the domains where + // the register is available for free. + unsigned Mask; + + // Position of the last defining instruction. + unsigned Dist; + + // Twiddleable instructions using or defining these registers. + SmallVector<MachineInstr*, 8> Instrs; + + // Collapsed DomainValue have no instructions to twiddle - it simply keeps + // track of the domains where the registers are already available. + bool collapsed() const { return Instrs.empty(); } + + // Is any domain in mask available? + bool compat(unsigned mask) const { + return Mask & mask; + } + + // Mark domain as available. + void add(unsigned domain) { + Mask |= 1u << domain; + } + + // First domain available in mask. 
+ unsigned firstDomain() const { + return CountTrailingZeros_32(Mask); + } + + DomainValue() { clear(); } + + void clear() { + Refs = Mask = Dist = 0; + Instrs.clear(); + } +}; + +static const unsigned NumRegs = 16; + +class SSEDomainFixPass : public MachineFunctionPass { + static char ID; + PoolAllocator<DomainValue> Pool; + + MachineFunction *MF; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineBasicBlock *MBB; + DomainValue **LiveRegs; + typedef DenseMap<MachineBasicBlock*,DomainValue**> LiveOutMap; + LiveOutMap LiveOuts; + unsigned Distance; + +public: + SSEDomainFixPass() : MachineFunctionPass(&ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "SSE execution domain fixup"; + } + +private: + // Register mapping. + int RegIndex(unsigned Reg); + + // LiveRegs manipulations. + void SetLiveReg(int rx, DomainValue *DV); + void Kill(int rx); + void Force(int rx, unsigned domain); + void Collapse(DomainValue *dv, unsigned domain); + bool Merge(DomainValue *A, DomainValue *B); + + void enterBasicBlock(); + void visitGenericInstr(MachineInstr*); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); +}; +} + +char SSEDomainFixPass::ID = 0; + +/// Translate TRI register number to an index into our smaller tables of +/// interesting registers. Return -1 for boring registers. +int SSEDomainFixPass::RegIndex(unsigned reg) { + // Registers are sorted lexicographically. + // We just need them to be consecutive, ordering doesn't matter. + assert(X86::XMM9 == X86::XMM0+NumRegs-1 && "Unexpected sort"); + reg -= X86::XMM0; + return reg < NumRegs ? reg : -1; +} + +/// Set LiveRegs[rx] = dv, updating reference counts. +void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + if (!LiveRegs) + LiveRegs = (DomainValue**)calloc(sizeof(DomainValue*), NumRegs); + + if (LiveRegs[rx] == dv) + return; + if (LiveRegs[rx]) { + assert(LiveRegs[rx]->Refs && "Bad refcount"); + if (--LiveRegs[rx]->Refs == 0) Pool.Recycle(LiveRegs[rx]); + } + LiveRegs[rx] = dv; + if (dv) ++dv->Refs; +} + +// Kill register rx, recycle or collapse any DomainValue. +void SSEDomainFixPass::Kill(int rx) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + if (!LiveRegs || !LiveRegs[rx]) return; + + // Before killing the last reference to an open DomainValue, collapse it to + // the first available domain. + if (LiveRegs[rx]->Refs == 1 && !LiveRegs[rx]->collapsed()) + Collapse(LiveRegs[rx], LiveRegs[rx]->firstDomain()); + else + SetLiveReg(rx, 0); +} + +/// Force register rx into domain. +void SSEDomainFixPass::Force(int rx, unsigned domain) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + DomainValue *dv; + if (LiveRegs && (dv = LiveRegs[rx])) { + if (dv->collapsed()) + dv->add(domain); + else + Collapse(dv, domain); + } else { + // Set up basic collapsed DomainValue. + dv = Pool.Alloc(); + dv->Dist = Distance; + dv->add(domain); + SetLiveReg(rx, dv); + } +} + +/// Collapse open DomainValue into given domain. If there are multiple +/// registers using dv, they each get a unique collapsed DomainValue. +void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) { + assert(dv->compat(1u << domain) && "Cannot collapse"); + + // Collapse all the instructions. 
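// DomainValue keeps its candidate execution domains in a plain bitmask, so
// add(), compat() and firstDomain() above reduce to simple bit operations.
// A tiny worked example with made-up domain numbers (illustrative, not part
// of the patch):

DomainValue dv;                          // open value, Mask == 0
dv.add(1);                               // Mask == 0x2 (binary 010)
dv.add(2);                               // Mask == 0x6 (binary 110)
bool StillPossible = dv.compat(0x4);     // true: domain 2 is still available
unsigned Lowest = dv.firstDomain();      // 1: lowest set bit, used by Collapse()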
+ while (!dv->Instrs.empty()) { + MachineInstr *mi = dv->Instrs.back(); + TII->SetSSEDomain(mi, domain); + dv->Instrs.pop_back(); + } + dv->Mask = 1u << domain; + + // If there are multiple users, give them new, unique DomainValues. + if (LiveRegs && dv->Refs > 1) { + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx] == dv) { + DomainValue *dv2 = Pool.Alloc(); + dv2->Dist = Distance; + dv2->add(domain); + SetLiveReg(rx, dv2); + } + } +} + +/// Merge - All instructions and registers in B are moved to A, and B is +/// released. +bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) { + assert(!A->collapsed() && "Cannot merge into collapsed"); + assert(!B->collapsed() && "Cannot merge from collapsed"); + if (A == B) + return true; + if (!A->compat(B->Mask)) + return false; + A->Mask &= B->Mask; + A->Dist = std::max(A->Dist, B->Dist); + A->Instrs.append(B->Instrs.begin(), B->Instrs.end()); + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx] == B) + SetLiveReg(rx, A); + return true; +} + +void SSEDomainFixPass::enterBasicBlock() { + // Try to coalesce live-out registers from predecessors. + for (MachineBasicBlock::const_livein_iterator i = MBB->livein_begin(), + e = MBB->livein_end(); i != e; ++i) { + int rx = RegIndex(*i); + if (rx < 0) continue; + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); pi != pe; ++pi) { + LiveOutMap::const_iterator fi = LiveOuts.find(*pi); + if (fi == LiveOuts.end()) continue; + DomainValue *pdv = fi->second[rx]; + if (!pdv) continue; + if (!LiveRegs || !LiveRegs[rx]) { + SetLiveReg(rx, pdv); + continue; + } + + // We have a live DomainValue from more than one predecessor. + if (LiveRegs[rx]->collapsed()) { + // We are already collapsed, but predecessor is not. Force him. + if (!pdv->collapsed()) + Collapse(pdv, LiveRegs[rx]->firstDomain()); + continue; + } + + // Currently open, merge in predecessor. + if (!pdv->collapsed()) + Merge(LiveRegs[rx], pdv); + else + Collapse(LiveRegs[rx], pdv->firstDomain()); + } + } +} + +// A hard instruction only works in one domain. All input registers will be +// forced into that domain. +void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) { + // Collapse all uses. + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Force(rx, domain); + } + + // Kill all defs and force them. + for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Kill(rx); + Force(rx, domain); + } +} + +// A soft instruction can be changed to work in other domains given by mask. +void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) { + // Scan the explicit use operands for incoming domains. + unsigned collmask = mask; + SmallVector<int, 4> used; + if (LiveRegs) + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + if (DomainValue *dv = LiveRegs[rx]) { + // Is it possible to use this collapsed register for free? 
+ if (dv->collapsed()) { + if (unsigned m = collmask & dv->Mask) + collmask = m; + } else if (dv->compat(collmask)) + used.push_back(rx); + else + Kill(rx); + } + } + + // If the collapsed operands force a single domain, propagate the collapse. + if (isPowerOf2_32(collmask)) { + unsigned domain = CountTrailingZeros_32(collmask); + TII->SetSSEDomain(mi, domain); + visitHardInstr(mi, domain); + return; + } + + // Kill off any remaining uses that don't match collmask, and build a list of + // incoming DomainValue that we want to merge. + SmallVector<DomainValue*,4> doms; + for (SmallVector<int, 4>::iterator i=used.begin(), e=used.end(); i!=e; ++i) { + int rx = *i; + DomainValue *dv = LiveRegs[rx]; + // This useless DomainValue could have been missed above. + if (!dv->compat(collmask)) { + Kill(*i); + continue; + } + // sorted, uniqued insert. + bool inserted = false; + for (SmallVector<DomainValue*,4>::iterator i = doms.begin(), e = doms.end(); + i != e && !inserted; ++i) { + if (dv == *i) + inserted = true; + else if (dv->Dist < (*i)->Dist) { + inserted = true; + doms.insert(i, dv); + } + } + if (!inserted) + doms.push_back(dv); + } + + // doms are now sorted in order of appearance. Try to merge them all, giving + // priority to the latest ones. + DomainValue *dv = 0; + while (!doms.empty()) { + if (!dv) { + dv = doms.pop_back_val(); + continue; + } + + DomainValue *ThisDV = doms.pop_back_val(); + if (Merge(dv, ThisDV)) continue; + + for (SmallVector<int,4>::iterator i=used.begin(), e=used.end(); i != e; ++i) + if (LiveRegs[*i] == ThisDV) + Kill(*i); + } + + // dv is the DomainValue we are going to use for this instruction. + if (!dv) + dv = Pool.Alloc(); + dv->Dist = Distance; + dv->Mask = collmask; + dv->Instrs.push_back(mi); + + // Finally set all defs and non-collapsed uses to dv. + for (unsigned i = 0, e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + if (!LiveRegs || !LiveRegs[rx] || (mo.isDef() && LiveRegs[rx]!=dv)) { + Kill(rx); + SetLiveReg(rx, dv); + } + } +} + +void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) { + // Process explicit defs, kill any XMM registers redefined. + for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Kill(rx); + } +} + +bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast<const X86InstrInfo*>(MF->getTarget().getInstrInfo()); + TRI = MF->getTarget().getRegisterInfo(); + MBB = 0; + LiveRegs = 0; + Distance = 0; + assert(NumRegs == X86::VR128RegClass.getNumRegs() && "Bad regclass"); + + // If no XMM registers are used in the function, we can skip it completely. 
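// visitSoftInstr() above starts from the instruction's full candidate mask and
// intersects it with every collapsed input register it sees; once only one
// domain survives, the instruction is rewritten and treated as a hard
// instruction. A worked example with made-up masks (illustrative only):
//
//   instruction supports          mask     == 0x6 (binary 0110)
//   collapsed input available in  dv->Mask == 0x4 (binary 0100)
//     -> collmask narrows to 0x4
//     -> isPowerOf2_32(collmask) holds, so domain = CountTrailingZeros_32 = 2,
//        TII->SetSSEDomain(mi, 2) runs, and the rest goes through
//        visitHardInstr().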
+ bool anyregs = false; + for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(), + E = X86::VR128RegClass.end(); I != E; ++I) + if (MF->getRegInfo().isPhysRegUsed(*I)) { + anyregs = true; + break; + } + if (!anyregs) return false; + + MachineBasicBlock *Entry = MF->begin(); + SmallPtrSet<MachineBasicBlock*, 16> Visited; + for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 16> > + DFI = df_ext_begin(Entry, Visited), DFE = df_ext_end(Entry, Visited); + DFI != DFE; ++DFI) { + MBB = *DFI; + enterBasicBlock(); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + MachineInstr *mi = I; + if (mi->isDebugValue()) continue; + ++Distance; + std::pair<uint16_t, uint16_t> domp = TII->GetSSEDomain(mi); + if (domp.first) + if (domp.second) + visitSoftInstr(mi, domp.second); + else + visitHardInstr(mi, domp.first); + else if (LiveRegs) + visitGenericInstr(mi); + } + + // Save live registers at end of MBB - used by enterBasicBlock(). + if (LiveRegs) + LiveOuts.insert(std::make_pair(MBB, LiveRegs)); + LiveRegs = 0; + } + + // Clear the LiveOuts vectors. Should we also collapse any remaining + // DomainValues? + for (LiveOutMap::const_iterator i = LiveOuts.begin(), e = LiveOuts.end(); + i != e; ++i) + free(i->second); + LiveOuts.clear(); + Pool.Clear(); + + return false; +} + +FunctionPass *llvm::createSSEDomainFixPass() { + return new SSEDomainFixPass(); +} diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index c753cf2..9be38a4 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -41,6 +41,10 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, /// FunctionPass *createX86FloatingPointStackifierPass(); +/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain +/// crossings. +FunctionPass *createSSEDomainFixPass(); + /// createX87FPRegKillInserterPass - This function returns a pass which /// inserts FP_REG_KILL instructions where needed. 
/// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 2be51e1..6b62795 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -59,6 +59,9 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", [FeatureCMOV]>; def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; +def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", + "IsUAMemFast", "true", + "Fast unaligned memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions">; @@ -98,8 +101,10 @@ def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; +def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; // Sandy Bridge does not have FMA def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>; @@ -160,10 +165,11 @@ def X86InstrInfo : InstrInfo { "hasAdSizePrefix", "Prefix", "hasREX_WPrefix", - "ImmTypeBits", - "FPFormBits", + "ImmT.Value", + "FPForm.Value", "hasLockPrefix", "SegOvrBits", + "ExeDomain.Value", "Opcode"]; let TSFlagsShifts = [0, 6, @@ -174,6 +180,7 @@ def X86InstrInfo : InstrInfo { 16, 19, 20, + 22, 24]; } diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index 754a200..8e2928c3 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -10,10 +10,14 @@ #include "llvm/Target/TargetAsmBackend.h" #include "X86.h" #include "X86FixupKinds.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MachObjectWriter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; @@ -48,8 +52,135 @@ public: for (unsigned i = 0; i != Size; ++i) DF.getContents()[Fixup.Offset + i] = uint8_t(Value >> (i * 8)); } + + bool MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl<MCAsmFixup> &Fixups) const; + + void RelaxInstruction(const MCInstFragment *IF, MCInst &Res) const; + + bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; }; +static unsigned getRelaxedOpcode(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl<MCAsmFixup> &Fixups) const { + // Check for a 
1byte pcrel fixup, and enforce that we would know how to relax + // this instruction. + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + if (unsigned(Fixups[i].Kind) == X86::reloc_pcrel_1byte) { + assert(getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode()); + return true; + } + } + + return false; +} + +// FIXME: Can tblgen help at all here to verify there aren't other instructions +// we can relax? +void X86AsmBackend::RelaxInstruction(const MCInstFragment *IF, + MCInst &Res) const { + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. + unsigned RelaxedOp = getRelaxedOpcode(IF->getInst().getOpcode()); + + if (RelaxedOp == IF->getInst().getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + IF->getInst().dump_pretty(OS); + llvm_report_error("unexpected instruction to relax: " + OS.str()); + } + + Res = IF->getInst(); + Res.setOpcode(RelaxedOp); +} + +/// WriteNopData - Write optimal nops to the output file for the \arg Count +/// bytes. This returns the number of bytes written. It may return 0 if +/// the \arg Count is more than the maximum optimal nops. +/// +/// FIXME this is X86 32-bit specific and should move to a better place. +bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { + static const uint8_t Nops[16][16] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // nopl (%[re]ax) + {0x0f, 0x1f, 0x00}, + // nopl 0(%[re]ax) + {0x0f, 0x1f, 0x40, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw 0L(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw %cs:0L(%[re]ax,%[re]ax,1) + {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + // nopw 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + // nopl 0L(%[re]ax) */ + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax) + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax) + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} + }; + + // Write an optimal sequence for the first 15 bytes. + uint64_t OptimalCount = (Count < 16) ? Count : 15; + for (uint64_t i = 0, e = OptimalCount; i != e; i++) + OW->Write8(Nops[OptimalCount - 1][i]); + + // Finish with single byte nops. + for (uint64_t i = OptimalCount, e = Count; i != e; ++i) + OW->Write8(0x90); + + return true; +} + +/* *** */ + class ELFX86AsmBackend : public X86AsmBackend { public: ELFX86AsmBackend(const Target &T) diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 5d3edbb..c69eeb3 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -383,7 +383,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) { if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); // They have to fit in the 32-bit signed displacement field though. 
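// The new X86AsmBackend hooks earlier in this block pair every one-byte
// pc-relative jump with its four-byte form: MayNeedRelaxation() fires on any
// reloc_pcrel_1byte fixup, and RelaxInstruction() swaps the opcode through
// getRelaxedOpcode(). A condensed sketch of the rewrite the assembler ends up
// performing (illustrative; the MCInst calls and opcode names are the ones
// used in the hunk):

MCInst Short;
Short.setOpcode(X86::JE_1);                           // je rel8
MCInst Long = Short;                                  // operands carried over
Long.setOpcode(getRelaxedOpcode(Short.getOpcode()));  // X86::JE_4, je rel32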
- if (isInt32(Disp)) { + if (isInt<32>(Disp)) { AM.Disp = (uint32_t)Disp; return X86SelectAddress(U->getOperand(0), AM); } @@ -427,7 +427,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) { } } // Check for displacement overflow. - if (!isInt32(Disp)) + if (!isInt<32>(Disp)) break; // Ok, the GEP indices were covered by constant-offset and scaled-index // addressing. Update the address state and move on to examining the base. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 704f9c6..b24d5a1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -802,6 +802,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!VT.is128BitVector()) { continue; } + setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); setOperationAction(ISD::OR, SVT, Promote); @@ -1008,7 +1009,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: These should be based on subtarget info. Plus, the values should // be smaller when we are in optimizing for size mode. maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; @@ -1066,23 +1067,37 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { } /// getOptimalMemOpType - Returns the target specific optimal type for load -/// and store operations as a result of memset, memcpy, and memmove -/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for -/// determining it. +/// and store operations as a result of memset, memcpy, and memmove lowering. +/// If DstAlign is zero that means it's safe to destination alignment can +/// satisfy any constraint. Similarly if SrcAlign is zero it means there +/// isn't a need to check it against alignment requirement, probably because +/// the source does not need to be loaded. It returns EVT::Other if +/// SelectionDAG should be responsible for determining it. EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. 
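// The X86 override of getOptimalMemOpType() now folds in both alignments and
// the new fast-unaligned-access subtarget bit. Two hypothetical queries
// against the body that follows (illustrative values; assumes SSE2, a 16-byte
// aligned stack, and no noimplicitfloat attribute):
//
//   Size = 32, DstAlign = 16, SrcAlign = 0
//     -> the 16-byte vector path is taken and MVT::v4i32 is returned.
//   Size = 32, DstAlign = 4, SrcAlign = 4, subtarget without fast unaligned
//   access
//     -> the vector case is rejected; on x86-64 the fallback returns MVT::i64.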
const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) - return MVT::v4i32; - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) - return MVT::v4f32; + if (!F->hasFnAttr(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (Subtarget->isUnalignedMemAccessFast() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16))) && + Subtarget->getStackAlignment() >= 16) { + if (Subtarget->hasSSE2()) + return MVT::v4i32; + if (SafeToUseFP && Subtarget->hasSSE1()) + return MVT::v4f32; + } else if (SafeToUseFP && + Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->getStackAlignment() >= 8 && + Subtarget->hasSSE2()) + return MVT::f64; } if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; @@ -1108,8 +1123,8 @@ MCSymbol * X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, MCContext &Ctx) const { const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); - return Ctx.GetOrCreateTemporarySymbol(Twine(MAI.getPrivateGlobalPrefix())+ - Twine(MF->getFunctionNumber())+"$pb"); + return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ + Twine(MF->getFunctionNumber())+"$pb"); } @@ -2290,6 +2305,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; // If -tailcallopt is specified, make fastcc functions tail-callable. + const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); if (GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && @@ -2301,8 +2317,14 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Look for obvious safe cases to perform tail call optimization that does not // requite ABI changes. This is what gcc calls sibcall. - // Do not sibcall optimize vararg calls for now. - if (isVarArg) + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Do not sibcall optimize vararg calls unless the call site is not passing any + // arguments. + if (isVarArg && !Outs.empty()) return false; // Also avoid sibcall optimization if either caller or callee uses struct @@ -2417,7 +2439,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. - if (!isInt32(Offset)) + if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra @@ -3613,6 +3635,69 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large +/// load which has the same value as a build_vector whose operands are 'elts'. +/// +/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. 
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, + DebugLoc &dl, SelectionDAG &DAG) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = NULL; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast<LoadSDNode>(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast<LoadSDNode>(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. + if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->getAlignment()); + } else if (NumElems == 4 && LastLoadedElt == 1) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); @@ -3841,14 +3926,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 2) { - // If we have SSE 4.1, Expand into a number of inserts unless the number of - // values to be inserted is equal to the number of elements, in which case - // use the unpack code below in the hopes of matching the consecutive elts - // load merge pattern for shuffles. - // FIXME: We could probably just check that here directly. - if (Values.size() < NumElems && VT.getSizeInBits() == 128 && - getSubtarget()->hasSSE41()) { + if (Values.size() > 1 && VT.getSizeInBits() == 128) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // For SSE 4.1, use inserts into undef. + if (getSubtarget()->hasSSE41()) { V[0] = DAG.getUNDEF(VT); for (unsigned i = 0; i < NumElems; ++i) if (Op.getOperand(i).getOpcode() != ISD::UNDEF) @@ -3856,7 +3945,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { Op.getOperand(i), DAG.getIntPtrConstant(i)); return V[0]; } - // Expand into a number of unpckl*. + + // Otherwise, expand into a number of unpckl* // e.g. 
for v4f32 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> @@ -3871,7 +3961,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } return V[0]; } - return SDValue(); } @@ -8797,83 +8886,24 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, - EVT EltVT, LoadSDNode *&LDBase, - unsigned &LastLoadedElt, - SelectionDAG &DAG, MachineFrameInfo *MFI, - const TargetLowering &TLI) { - LDBase = NULL; - LastLoadedElt = -1U; - for (unsigned i = 0; i < NumElems; ++i) { - if (N->getMaskElt(i) < 0) { - if (!LDBase) - return false; - continue; - } - - SDValue Elt = DAG.getShuffleScalarElt(N, i); - if (!Elt.getNode() || - (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) - return false; - if (!LDBase) { - if (Elt.getNode()->getOpcode() == ISD::UNDEF) - return false; - LDBase = cast<LoadSDNode>(Elt.getNode()); - LastLoadedElt = i; - continue; - } - if (Elt.getOpcode() == ISD::UNDEF) - continue; - - LoadSDNode *LD = cast<LoadSDNode>(Elt); - if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) - return false; - LastLoadedElt = i; - } - return true; -} - /// PerformShuffleCombine - Combine a vector_shuffle that is equal to /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load /// if the load addresses are consecutive, non-overlapping, and in the right -/// order. In the case of v2i64, it will see if it can rewrite the -/// shuffle to be an appropriate build vector so it can take advantage of -// performBuildVectorCombine. +/// order. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); - EVT EltVT = VT.getVectorElementType(); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); - unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return SDValue(); - // Try to combine a vector_shuffle into a 128-bit load. 
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - LoadSDNode *LD = NULL; - unsigned LastLoadedElt; - if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, - MFI, TLI)) - return SDValue(); - - if (LastLoadedElt == NumElems - 1) { - if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16) - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), 0); - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1) { - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); - SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); - } - return SDValue(); + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) + Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); + + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); } /// PerformShuffleCombine - Detect vector gather/scatter index generation diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 0f15eba..4549cba 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -417,12 +417,15 @@ namespace llvm { virtual unsigned getByValTypeAlignment(const Type *Ty) const; /// getOptimalMemOpType - Returns the target specific optimal type for load - /// and store operations as a result of memset, memcpy, and memmove - /// lowering. It returns EVT::iAny if SelectionDAG should be responsible for - /// determining it. - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const; + /// and store operations as a result of memset, memcpy, and memmove lowering. + /// If DstAlign is zero that means it's safe to destination alignment can + /// satisfy any constraint. Similarly if SrcAlign is zero it means there + /// isn't a need to check it against alignment requirement, probably because + /// the source does not need to be loaded. It returns EVT::Other if + /// SelectionDAG should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const; /// allowsUnalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. 
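Before moving on to the .td changes: the two X86ISelLowering hunks above replace the old (Align, isSrcConst, isSrcStr) signature of getOptimalMemOpType with separate DstAlign/SrcAlign values (zero meaning "no constraint to check") plus a SafeToUseFP flag. Below is a compact restatement of the selection logic from the .cpp hunk, with plain bools standing in for the Subtarget and Function queries; it is illustrative only and not the LLVM API:

  // Illustrative paraphrase of the new heuristic; names are hypothetical.
  #include <cstdint>

  enum class WideTy { V4I32, V4F32, F64, I64, Other };

  static WideTy pickMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                              bool SafeToUseFP, bool NoImplicitFloat,
                              bool FastUnaligned, bool HasSSE1, bool HasSSE2,
                              bool Is64Bit, unsigned StackAlign) {
    // Zero alignment means the corresponding side imposes no constraint.
    bool AlignedEnough = FastUnaligned ||
                         ((DstAlign == 0 || DstAlign >= 16) &&
                          (SrcAlign == 0 || SrcAlign >= 16));
    if (!NoImplicitFloat) {
      if (Size >= 16 && AlignedEnough && StackAlign >= 16) {
        if (HasSSE2) return WideTy::V4I32;        // 16-byte integer vector ops
        if (SafeToUseFP && HasSSE1) return WideTy::V4F32;
      } else if (SafeToUseFP && Size >= 8 && !Is64Bit &&
                 StackAlign >= 8 && HasSSE2) {
        return WideTy::F64;                       // 8-byte FP moves on 32-bit
      }
    }
    if (Is64Bit && Size >= 8)
      return WideTy::I64;                         // plain 64-bit GPR moves
    return WideTy::Other;                         // defer to the generic lowering
  }

The Other case stands for whatever the function returns past the portion quoted in this hunk.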
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td index 8cbb756..eef2ca0 100644 --- a/lib/Target/X86/X86Instr64bit.td +++ b/lib/Target/X86/X86Instr64bit.td @@ -295,19 +295,17 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), let Defs = [EFLAGS] in { def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsf GR64:$src)), (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsf (loadi64 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsr GR64:$src)), (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsr (loadi64 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB; } // Defs = [EFLAGS] // Repeat string ops @@ -508,8 +506,8 @@ let isCommutable = 1 in def ADD64rr : RI<0x01, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, GR64:$src2))]>; // These are alternate spellings for use by the disassembler, we mark them as // code gen only to ensure they aren't matched by the assembler. @@ -523,21 +521,21 @@ let isCodeGenOnly = 1 in { def ADD64ri8 : RIi8<0x83, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, i64immSExt8:$src2))]>; def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, i64immSExt32:$src2))]>; } // isConvertibleToThreeAddress // Register-Memory Addition def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, (load addr:$src2)))]>; } // isTwoAddress @@ -604,8 +602,8 @@ let isTwoAddress = 1 in { def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, GR64:$src2))]>; def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -615,20 +613,20 @@ def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), def SUB64rm : RI<0x2B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, (load addr:$src2)))]>; // Register-Integer Subtraction def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst), (ins GR64:$src1, 
i64i8imm:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, i64immSExt8:$src2))]>; def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def SUB64i32 : RIi32<0x2D, RawFrm, (outs), (ins i32imm:$src), @@ -716,15 +714,15 @@ let isCommutable = 1 in def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (mul GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))]>, TB; // Register-Memory Signed Integer Multiplication def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB; } // isTwoAddress // Suprisingly enough, these are not two address instructions! @@ -733,27 +731,27 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>; def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32 (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>; // Memory-Integer Signed Integer Multiplication def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul (load addr:$src1), - i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt8:$src2))]>; def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32 (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul (load addr:$src1), - i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt32:$src2))]>; } // Defs = [EFLAGS] // Unsigned division / remainder @@ -787,16 +785,14 @@ def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst", - [(set GR64:$dst, (add GR64:$src, 1)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src))]>; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)]>; let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst", - [(set GR64:$dst, (add GR64:$src, -1)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, 
(X86dec_flag GR64:$src))]>; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), (implicit EFLAGS)]>; @@ -806,23 +802,19 @@ let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, 1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>, OpSize, Requires<[In64BitMode]>; def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, 1)), - (implicit EFLAGS)]>, + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>, Requires<[In64BitMode]>; def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, -1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>, OpSize, Requires<[In64BitMode]>; def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, -1)), - (implicit EFLAGS)]>, + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, Requires<[In64BitMode]>; } // isConvertibleToThreeAddress @@ -1092,26 +1084,26 @@ let isCommutable = 1 in def AND64rr : RI<0x21, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, GR64:$src2))]>; def AND64rr_REV : RI<0x23, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", []>; def AND64rm : RI<0x23, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, (load addr:$src2)))]>; def AND64ri8 : RIi8<0x83, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, i64immSExt8:$src2))]>; def AND64ri32 : RIi32<0x81, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def AND64mr : RI<0x21, MRMDestMem, @@ -1135,26 +1127,26 @@ let isCommutable = 1 in def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, GR64:$src2))]>; def OR64rr_REV : RI<0x0B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", []>; def OR64rm : RI<0x0B, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, (load addr:$src2)))]>; def OR64ri8 : RIi8<0x83, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, i64immSExt8:$src2))]>; def 
OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -1178,26 +1170,26 @@ let isCommutable = 1 in def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, GR64:$src2))]>; def XOR64rr_REV : RI<0x33, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", []>; def XOR64rm : RI<0x33, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, (load addr:$src2)))]>; def XOR64ri8 : RIi8<0x83, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, i64immSExt8:$src2))]>; def XOR64ri32 : RIi32<0x81, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -2181,14 +2173,11 @@ def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1), // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. 
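The comment just above states the identity these AddedComplexity patterns rely on: when the selected operands are proven to have no set bits in common, addition generates no carries, so OR and ADD compute the same value and the or node can be matched as ADD64ri8, ADD64ri32, or ADD64rr. A tiny standalone check of that identity (illustrative only):

  #include <cassert>
  #include <cstdint>

  int main() {
    // With a & b == 0, each bit position holds at most one 1, so addition
    // produces no carries and a | b equals a + b.
    uint64_t a = 0x1230;   // low nibble known zero
    uint64_t b = 0x000F;   // fits entirely in that disjoint low nibble
    assert((a & b) == 0);
    assert((a | b) == (a + b));   // both equal 0x123F
    return 0;
  }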
let AddedComplexity = 5 in { // Try this before the selecting to OR -def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; -def : Pat<(parallel (or_is_add GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; } // AddedComplexity @@ -2215,136 +2204,76 @@ def : Pat<(subc GR64:$src1, imm:$src2), // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// -// Register-Register Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// addition +def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), (ADD64rm GR64:$src1, addr:$src2)>; -// Register-Register Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>; - -// Register-Memory Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), (SUB64rm GR64:$src1, addr:$src2)>; - -// Register-Integer Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; -// Register-Register Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), (IMUL64rr GR64:$src1, GR64:$src2)>; - -// Register-Memory Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), (IMUL64rm GR64:$src1, addr:$src2)>; - -// Register-Integer Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; - -// Memory-Integer Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel 
(X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; -// INC and DEC with EFLAGS result. Note that these do not set CF. -def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)), - (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)), - (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; - -def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)), - (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)), - (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; - -def : Pat<(parallel (X86inc_flag GR64:$src), (implicit EFLAGS)), - (INC64r GR64:$src)>; -def : Pat<(parallel (X86dec_flag GR64:$src), (implicit EFLAGS)), - (DEC64r GR64:$src)>; - -// Register-Register Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), - (OR64rr GR64:$src1, GR64:$src2)>; +// inc/dec +def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; +def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; -// Register-Integer Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +// or +def : Pat<(or GR64:$src1, GR64:$src2), + (OR64rr GR64:$src1, GR64:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, i64immSExt32:$src2), (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), (OR64rm GR64:$src1, addr:$src2)>; -// Register-Register Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// xor +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), (XOR64rm GR64:$src1, addr:$src2)>; -// Register-Register Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// and +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical And with 
EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, i64immSExt8:$src2), (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, i64immSExt32:$src2), (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), (AND64rm GR64:$src1, addr:$src2)>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index bb81cbf..d25ec26 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -68,6 +68,16 @@ def CompareFP : FPFormat<5>; def CondMovFP : FPFormat<6>; def SpecialFP : FPFormat<7>; +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. +class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize { bit hasOpSizePrefix = 1; } @@ -93,7 +103,7 @@ class TA { bits<4> Prefix = 14; } class TF { bits<4> Prefix = 15; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, - string AsmStr> + string AsmStr, Domain d = GenericDomain> : Instruction { let Namespace = "X86"; @@ -101,7 +111,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, Format Form = f; bits<6> FormBits = Form.Value; ImmType ImmT = i; - bits<3> ImmTypeBits = ImmT.Value; dag OutOperandList = outs; dag InOperandList = ins; @@ -115,20 +124,21 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bits<4> Prefix = 0; // Which prefix byte does this inst have? bit hasREX_WPrefix = 0; // Does this inst requires the REX.W prefix? - FPFormat FPForm; // What flavor of FP instruction is this? - bits<3> FPFormBits = 0; + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? bits<2> SegOvrBits = 0; // Segment override prefix. + Domain ExeDomain = d; } -class I<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern> - : X86Inst<o, f, NoImm, outs, ins, asm> { +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, d> { let Pattern = pattern; let CodeSize = 3; } class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm8 , outs, ins, asm> { + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, d> { let Pattern = pattern; let CodeSize = 3; } @@ -166,7 +176,7 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> // FpI_ - Floating Point Psuedo Instruction template. Not Predicated. 
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> : X86Inst<0, Pseudo, NoImm, outs, ins, ""> { - let FPForm = fp; let FPFormBits = FPForm.Value; + let FPForm = fp; let Pattern = pattern; } @@ -196,14 +206,16 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; -class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasSSE1]>; // SSE2 Instruction Templates: // @@ -222,10 +234,12 @@ class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE2]>; // SSE3 Instruction Templates: // @@ -235,12 +249,15 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE3]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS, + Requires<[HasSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE3]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD, + Requires<[HasSSE3]>; class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE3]>; // SSSE3 Instruction Templates: @@ -254,10 +271,12 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: // @@ -266,17 +285,20 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, // class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + 
Requires<[HasSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSE41]>; // SSE4.2 Instruction Templates: // // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSE42]>; // SS42FI - SSE 4.2 instructions with TF prefix. class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -286,7 +308,8 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE42]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSE42]>; // X86-64 Instruction templates... // diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 139a905..c0c9d98 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -597,7 +597,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PMULHUWrr, X86::PMULHUWrm, 16 }, { X86::PMULHWrr, X86::PMULHWrm, 16 }, { X86::PMULLDrr, X86::PMULLDrm, 16 }, - { X86::PMULLDrr_int, X86::PMULLDrm_int, 16 }, { X86::PMULLWrr, X86::PMULLWrm, 16 }, { X86::PMULUDQrr, X86::PMULUDQrm, 16 }, { X86::PORrr, X86::PORrm, 16 }, @@ -992,8 +991,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, /// a few instructions in each direction it assumes it's not safe. static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator E = MBB.end(); + // It's always safe to clobber EFLAGS at the end of a block. - if (I == MBB.end()) + if (I == E) return true; // For compile time consideration, if we are not able to determine the @@ -1017,20 +1018,28 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // This instruction defines EFLAGS, no need to look any further. return true; ++Iter; + // Skip over DBG_VALUE. + while (Iter != E && Iter->isDebugValue()) + ++Iter; // If we make it to the end of the block, it's safe to clobber EFLAGS. - if (Iter == MBB.end()) + if (Iter == E) return true; } + MachineBasicBlock::iterator B = MBB.begin(); Iter = I; for (unsigned i = 0; i < 4; ++i) { // If we make it to the beginning of the block, it's safe to clobber // EFLAGS iff EFLAGS is not live-in. - if (Iter == MBB.begin()) + if (Iter == B) return !MBB.isLiveIn(X86::EFLAGS); --Iter; + // Skip over DBG_VALUE. + while (Iter != B && Iter->isDebugValue()) + --Iter; + bool SawKill = false; for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { MachineOperand &MO = Iter->getOperand(j); @@ -1677,6 +1686,8 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; // Working from the bottom, when we see a non-terminator instruction, we're // done. 
@@ -1773,6 +1784,8 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != X86::JMP_4 && GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; @@ -2505,7 +2518,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { - case X86::V_SET0: + case X86::V_SET0PS: + case X86::V_SET0PD: + case X86::V_SET0PI: case X86::V_SETALLONES: Alignment = 16; break; @@ -2535,11 +2550,13 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, SmallVector<MachineOperand,X86AddrNumOperands> MOs; switch (LoadMI->getOpcode()) { - case X86::V_SET0: + case X86::V_SET0PS: + case X86::V_SET0PD: + case X86::V_SET0PI: case X86::V_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: { - // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. @@ -3648,3 +3665,51 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { X86FI->setGlobalBaseReg(GlobalBaseReg); return GlobalBaseReg; } + +// These are the replaceable SSE instructions. Some of these have Int variants +// that we don't include here. We don't want to replace instructions selected +// by intrinsics. +static const unsigned ReplaceableInstrs[][3] = { + //PackedInt PackedSingle PackedDouble + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. + +static const unsigned *lookup(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) + if (ReplaceableInstrs[i][domain-1] == opcode) + return ReplaceableInstrs[i]; + return 0; +} + +std::pair<uint16_t, uint16_t> +X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + return std::make_pair(domain, + domain && lookup(MI->getOpcode(), domain) ? 
0xe : 0); +} + +void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const unsigned *table = lookup(MI->getOpcode(), dom); + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5111719..f0bdd06 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -398,7 +398,10 @@ namespace X86II { FS = 1 << SegOvrShift, GS = 2 << SegOvrShift, - // Bits 22 -> 23 are unused + // Execution domain for SSE instructions in bits 22, 23. + // 0 in bits 22-23 means normal, non-SSE instruction. + SSEDomainShift = 22, + OpcodeShift = 24, OpcodeMask = 0xFF << OpcodeShift }; @@ -486,7 +489,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { /// MemOp2RegOpTable - Load / store unfolding opcode map. /// DenseMap<unsigned*, std::pair<unsigned, unsigned> > MemOp2RegOpTable; - + public: explicit X86InstrInfo(X86TargetMachine &tm); @@ -716,6 +719,13 @@ public: /// unsigned getGlobalBaseReg(MachineFunction *MF) const; + /// GetSSEDomain - Return the SSE execution domain of MI as the first element, + /// and a bitmask of possible arguments to SetSSEDomain ase the second. + std::pair<uint16_t, uint16_t> GetSSEDomain(const MachineInstr *MI) const; + + /// SetSSEDomain - Set the SSEDomain of MI. + void SetSSEDomain(MachineInstr *MI, unsigned Domain) const; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index c80a18d..8fccc8a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1,4 +1,4 @@ - +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -28,12 +28,13 @@ def SDTX86Cmov : SDTypeProfile<1, 4, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; // Unary and binary operator instructions that set EFLAGS as a side-effect. 
-def SDTUnaryArithWithFlags : SDTypeProfile<1, 1, - [SDTCisInt<0>]>; -def SDTBinaryArithWithFlags : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>]>; +def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; @@ -77,8 +78,8 @@ def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; -def X86bsf : SDNode<"X86ISD::BSF", SDTIntUnaryOp>; -def X86bsr : SDNode<"X86ISD::BSR", SDTIntUnaryOp>; +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; @@ -167,6 +168,7 @@ def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags, [SDNPCommutative]>; + def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, @@ -323,6 +325,7 @@ def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; @@ -473,15 +476,14 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); - else { - unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); - APInt Mask = APInt::getAllOnesValue(BitWidth); - APInt KnownZero0, KnownOne0; - CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); - APInt KnownZero1, KnownOne1; - CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); - return (~KnownZero0 & ~KnownZero1) == 0; - } + + unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); + APInt Mask = APInt::getAllOnesValue(BitWidth); + APInt KnownZero0, KnownOne0; + CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; }]>; // 'shld' and 'shrd' instruction patterns. Note that even though these have @@ -585,7 +587,7 @@ let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in // Return instructions. 
let isTerminator = 1, isReturn = 1, isBarrier = 1, - hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in { + hasCtrlDep = 1, FPForm = SpecialFP in { def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", [(X86retflag 0)]>; @@ -806,33 +808,29 @@ let isTwoAddress = 1 in // GR32 = bswap GR32 let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsf GR16:$src)), (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsf (loadi16 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsf GR32:$src)), (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsf (loadi32 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsr GR16:$src)), (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsr (loadi16 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsr GR32:$src)), (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsr (loadi32 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB; } // Defs = [EFLAGS] let neverHasSideEffects = 1 in @@ -1697,18 +1695,17 @@ let isTwoAddress = 0 in { let Defs = [EFLAGS] in { let CodeSize = 2 in def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst", - [(set GR8:$dst, (add GR8:$src, 1)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src))]>; + let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, 1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>, OpSize, Requires<[In32BitMode]>; def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, 1)), - (implicit EFLAGS)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>, + Requires<[In32BitMode]>; } let isTwoAddress = 0, CodeSize = 2 in { def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", @@ -1726,18 +1723,16 @@ let isTwoAddress = 0, CodeSize = 2 in { let CodeSize = 2 in def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst", - [(set GR8:$dst, (add GR8:$src, -1)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src))]>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, -1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>, OpSize, Requires<[In32BitMode]>; def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, -1)), - (implicit EFLAGS)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, + Requires<[In32BitMode]>; } let isTwoAddress = 0, CodeSize = 2 in { @@ -1758,21 +1753,20 @@ let isTwoAddress = 0, CodeSize = 2 in { // Logical operators... let Defs = [EFLAGS] in { let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y -def AND8rr : I<0x20, MRMDestReg, - (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), - "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; -def AND16rr : I<0x21, MRMDestReg, - (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; -def AND32rr : I<0x21, MRMDestReg, - (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; +def AND8rr : I<0x20, MRMDestReg, + (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), + "and{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, GR8:$src2))]>; +def AND16rr : I<0x21, MRMDestReg, + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "and{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + GR16:$src2))]>, OpSize; +def AND32rr : I<0x21, MRMDestReg, + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "and{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + GR32:$src2))]>; } // AND instructions with the destination register in REG and the source register @@ -1789,45 +1783,46 @@ def AND32rr_REV : I<0x23, MRMSrcReg, (outs GR32:$dst), def AND8rm : I<0x22, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, (loadi8 addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, + (loadi8 addr:$src2)))]>; def AND16rm : I<0x23, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, (loadi16 addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + (loadi16 
addr:$src2)))]>, + OpSize; def AND32rm : I<0x23, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, (loadi32 addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + (loadi32 addr:$src2)))]>; def AND8ri : Ii8<0x80, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2), "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, + imm:$src2))]>; def AND16ri : Ii16<0x81, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + imm:$src2))]>, OpSize; def AND32ri : Ii32<0x81, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + imm:$src2))]>; def AND16ri8 : Ii8<0x83, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def AND32ri8 : Ii8<0x83, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def AND8mr : I<0x20, MRMDestMem, @@ -1888,18 +1883,16 @@ let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y def OR8rr : I<0x08, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, GR8:$src2))]>; def OR16rr : I<0x09, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,GR16:$src2))]>, + OpSize; def OR32rr : I<0x09, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,GR32:$src2))]>; } // OR instructions with the destination register in REG and the source register @@ -1913,48 +1906,48 @@ def OR32rr_REV : I<0x0B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", []>; -def OR8rm : I<0x0A, MRMSrcMem , (outs GR8 :$dst), +def OR8rm : I<0x0A, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; -def OR16rm : I<0x0B, MRMSrcMem , (outs GR16:$dst), + [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, + (load addr:$src2)))]>; +def OR16rm : I<0x0B, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; -def OR32rm : I<0x0B, MRMSrcMem , (outs GR32:$dst), + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + (load addr:$src2)))]>, + OpSize; +def OR32rm : I<0x0B, 
MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + (load addr:$src2)))]>; def OR8ri : Ii8 <0x80, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst,EFLAGS, (X86or_flag GR8:$src1, imm:$src2))]>; def OR16ri : Ii16<0x81, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + imm:$src2))]>, OpSize; def OR32ri : Ii32<0x81, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + imm:$src2))]>; def OR16ri8 : Ii8<0x83, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def OR32ri8 : Ii8<0x83, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def OR8mr : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "or{b}\t{$src, $dst|$dst, $src}", @@ -2004,18 +1997,18 @@ let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y def XOR8rr : I<0x30, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, + GR8:$src2))]>; def XOR16rr : I<0x31, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + GR16:$src2))]>, OpSize; def XOR32rr : I<0x31, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + GR32:$src2))]>; } // isCommutable = 1 // XOR instructions with the destination register in REG and the source register @@ -2029,49 +2022,48 @@ def XOR32rr_REV : I<0x33, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", []>; -def XOR8rm : I<0x32, MRMSrcMem , +def XOR8rm : I<0x32, MRMSrcMem, (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2), "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; -def XOR16rm : I<0x33, MRMSrcMem , + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, + (load addr:$src2)))]>; +def XOR16rm : I<0x33, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + (load addr:$src2)))]>, OpSize; -def XOR32rm : I<0x33, MRMSrcMem , +def XOR32rm : I<0x33, MRMSrcMem, (outs 
GR32:$dst), (ins GR32:$src1, i32mem:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; - -def XOR8ri : Ii8<0x80, MRM6r, - (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), - "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; -def XOR16ri : Ii16<0x81, MRM6r, - (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), - "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + (load addr:$src2)))]>; + +def XOR8ri : Ii8<0x80, MRM6r, + (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "xor{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, imm:$src2))]>; +def XOR16ri : Ii16<0x81, MRM6r, + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "xor{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + imm:$src2))]>, OpSize; def XOR32ri : Ii32<0x81, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + imm:$src2))]>; def XOR16ri8 : Ii8<0x83, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def XOR32ri8 : Ii8<0x83, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def XOR8mr : I<0x30, MRMDestMem, @@ -2118,12 +2110,12 @@ let isTwoAddress = 0 in { [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst), (implicit EFLAGS)]>; - def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src), - "xor{b}\t{$src, %al|%al, $src}", []>; - def XOR16i16 : Ii16 <0x35, RawFrm, (outs), (ins i16imm:$src), - "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize; - def XOR32i32 : Ii32 <0x35, RawFrm, (outs), (ins i32imm:$src), - "xor{l}\t{$src, %eax|%eax, $src}", []>; + def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src), + "xor{b}\t{$src, %al|%al, $src}", []>; + def XOR16i16 : Ii16<0x35, RawFrm, (outs), (ins i16imm:$src), + "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize; + def XOR32i32 : Ii32<0x35, RawFrm, (outs), (ins i32imm:$src), + "xor{l}\t{$src, %eax|%eax, $src}", []>; } // isTwoAddress = 0 } // Defs = [EFLAGS] @@ -2690,21 +2682,20 @@ let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y def ADD8rr : I<0x00, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, GR8:$src2))]>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
// Register-Register Addition def ADD16rr : I<0x01, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1, + GR16:$src2))]>, OpSize; def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1, + GR32:$src2))]>; } // end isConvertibleToThreeAddress } // end isCommutable @@ -2723,47 +2714,47 @@ let isCodeGenOnly = 1 in { def ADD8rm : I<0x02, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, + (load addr:$src2)))]>; def ADD16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1, + (load addr:$src2)))]>, OpSize; def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1, + (load addr:$src2)))]>; // Register-Integer Addition def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86add_flag GR8:$src1, imm:$src2))]>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
// Register-Integer Addition def ADD16ri : Ii16<0x81, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86add_flag GR16:$src1, imm:$src2))]>, OpSize; def ADD32ri : Ii32<0x81, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86add_flag GR32:$src1, imm:$src2))]>; def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86add_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize; def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86add_flag GR32:$src1, i32immSExt8:$src2))]>; } let isTwoAddress = 0 in { @@ -2911,16 +2902,16 @@ let isTwoAddress = 0 in { // Register-Register Subtraction def SUB8rr : I<0x28, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, GR8:$src2))]>; def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, GR16:$src2))]>, OpSize; def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, GR32:$src2))]>; def SUB8rr_REV : I<0x2A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", []>; @@ -2935,45 +2926,45 @@ def SUB32rr_REV : I<0x2B, MRMSrcReg, (outs GR32:$dst), def SUB8rm : I<0x2A, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, (load addr:$src2)))]>; def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, (load addr:$src2)))]>, OpSize; def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, (load addr:$src2)))]>; // Register-Integer Subtraction def SUB8ri : Ii8 <0x80, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, imm:$src2))]>; def SUB16ri : Ii16<0x81, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, imm:$src2)), - (implicit 
EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, imm:$src2))]>, OpSize; def SUB32ri : Ii32<0x81, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, imm:$src2))]>; def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize; def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, i32immSExt8:$src2))]>; let isTwoAddress = 0 in { // Memory-Register Subtraction @@ -3122,25 +3113,26 @@ let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y // Register-Register Signed Integer Multiply def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (mul GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize; def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (mul GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))]>, TB; } // Register-Memory Signed Integer Multiply def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (load addr:$src2)))]>, + TB, OpSize; def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB; } // Defs = [EFLAGS] } // end Two Address instructions @@ -3150,47 +3142,49 @@ let Defs = [EFLAGS] in { def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize; def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))]>; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, + OpSize; def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul 
GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>; // Memory-Integer Signed Integer Multiply def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul (load addr:$src1), imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>, + OpSize; def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul (load addr:$src1), imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>; def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul (load addr:$src1), - i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i16immSExt8:$src2))]>, OpSize; def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul (load addr:$src1), - i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i32immSExt8:$src2))]>; } // Defs = [EFLAGS] //===----------------------------------------------------------------------===// @@ -4345,9 +4339,12 @@ def : Pat<(X86tcret GR32_TC:$dst, imm:$off), (TCRETURNri GR32_TC:$dst, imm:$off)>, Requires<[In32BitMode]>; +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[In32BitMode]>; + Requires<[In32BitMode, IsNotPIC]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, @@ -4722,23 +4719,17 @@ def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. 
let AddedComplexity = 5 in { // Try this before the selecting to OR -def : Pat<(parallel (or_is_add GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (or_is_add GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR16:$src1, GR16:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; } // AddedComplexity @@ -4746,270 +4737,173 @@ def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2), // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// -// Register-Register Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (ADD8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (ADD16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (ADD32rr GR32:$src1, GR32:$src2)>; +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), (ADD8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), (ADD16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), (ADD32rm GR32:$src1, addr:$src2)>; -// Register-Integer Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register Subtraction with EFLAGS 
result -def : Pat<(parallel (X86sub_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (SUB8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (SUB16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (SUB32rr GR32:$src1, GR32:$src2)>; +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), (SUB8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), (SUB16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), (SUB32rm GR32:$src1, addr:$src2)>; -// Register-Integer Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), (SUB8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, imm:$src2), (SUB16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, imm:$src2), (SUB32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, i16immSExt8:$src2), (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, GR32:$src2), (IMUL32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), (IMUL16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), (IMUL32rm GR32:$src1, addr:$src2)>; -// Register-Integer Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), (IMUL16rri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, imm:$src2), (IMUL32rri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, 
i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; -// Memory-Integer Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), imm:$src2), - (implicit EFLAGS)), +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), (IMUL16rmi addr:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), imm:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), (IMUL32rmi addr:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; // Optimize multiply by 2 with EFLAGS result. let AddedComplexity = 2 in { -def : Pat<(parallel (X86smul_flag GR16:$src1, 2), - (implicit EFLAGS)), - (ADD16rr GR16:$src1, GR16:$src1)>; - -def : Pat<(parallel (X86smul_flag GR32:$src1, 2), - (implicit EFLAGS)), - (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>; } -// INC and DEC with EFLAGS result. Note that these do not set CF. -def : Pat<(parallel (X86inc_flag GR8:$src), (implicit EFLAGS)), - (INC8r GR8:$src)>; -def : Pat<(parallel (X86dec_flag GR8:$src), (implicit EFLAGS)), - (DEC8r GR8:$src)>; - -def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)), - (INC16r GR16:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)), - (DEC16r GR16:$src)>, Requires<[In32BitMode]>; - -def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)), - (INC32r GR32:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)), - (DEC32r GR32:$src)>, Requires<[In32BitMode]>; - -// Register-Register Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (OR8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (OR16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (OR32rr GR32:$src1, GR32:$src2)>; - -// Register-Memory Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// Patterns for nodes that do not produce flags, for instructions that do. + +// Increment reg. +def : Pat<(add GR8:$src , 1), (INC8r GR8:$src)>; +def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>; + +// Decrement reg. +def : Pat<(add GR8:$src , -1), (DEC8r GR8:$src)>; +def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>; + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), (OR8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), (OR16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), (OR32rm GR32:$src1, addr:$src2)>; -// Register-Integer Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), - (OR8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), - (OR16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), - (OR32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or GR32:$src1, i32immSExt8:$src2), (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (XOR8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (XOR16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (XOR32rr GR32:$src1, GR32:$src2)>; - -// Register-Memory XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), (XOR8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), (XOR16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), (XOR32rm GR32:$src1, addr:$src2)>; -// Register-Integer XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), (XOR8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, imm:$src2), (XOR16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, imm:$src2), (XOR32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; 
-def : Pat<(parallel (X86xor_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (AND8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (AND16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (AND32rr GR32:$src1, GR32:$src2)>; - -// Register-Memory And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), (AND8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), (AND16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), (AND32rm GR32:$src1, addr:$src2)>; -// Register-Integer And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), (AND8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, imm:$src2), (AND16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, imm:$src2), (AND32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, i16immSExt8:$src2), (AND16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, i32immSExt8:$src2), (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; // -disable-16bit support. 
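
The X86InstrInfo.td hunks above replace the old two-entry idiom, (parallel (op ...), (implicit EFLAGS)), with patterns whose node (X86add_flag, X86sub_flag, X86and_flag, X86or_flag, X86xor_flag, X86smul_flag) produces two results, the integer value and EFLAGS, while plain add/sub/and/or/xor/mul patterns are kept separately for the flag-free case. The following fragment is an illustration only: the function is hypothetical, and the comments describe what a selector can do with such patterns on x86, not guaranteed output of any particular compiler version.

// Illustrative sketch. x86 arithmetic/logical instructions set EFLAGS
// themselves, so when a result is immediately tested against zero the flags
// from the defining instruction can be reused instead of emitting a separate
// TEST/CMP. The X86*_flag patterns model exactly that: one node, two results.
int sum_is_nonzero(int a, int b) {
  int s = a + b;   // an ADD32rr-style instruction defines both s and EFLAGS
  if (s != 0)      // may be selected as JNE on the ADD's flags, with no TEST
    return s;
  return 0;
}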
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index e1203e2..1c81c5e 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -605,22 +605,9 @@ let AddedComplexity = 10 in { def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), VR64:$src2)), (MMX_PANDNrr VR64:$src1, VR64:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))), - VR64:$src2)), - (MMX_PANDNrr VR64:$src1, VR64:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))), - VR64:$src2)), - (MMX_PANDNrr VR64:$src1, VR64:$src2)>; - def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), (load addr:$src2))), (MMX_PANDNrm VR64:$src1, addr:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))), - (load addr:$src2))), - (MMX_PANDNrm VR64:$src1, addr:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))), - (load addr:$src2))), - (MMX_PANDNrm VR64:$src1, addr:$src2)>; // Move MMX to lower 64-bit of XMM def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 720b663..dadc2a6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -69,8 +69,9 @@ def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; -def SDTX86CmpPTest : SDTypeProfile<0, 2, [SDTCisVT<0, v4f32>, - SDTCisVT<1, v4f32>]>; +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; //===----------------------------------------------------------------------===// @@ -1114,15 +1115,19 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), // load of an all-zeros value if folding it would be beneficial. // FIXME: Change encoding to pseudo! 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in -def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + isCodeGenOnly = 1 in { +def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v2f64 immAllZerosV))]>; +let ExeDomain = SSEPackedInt in +def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllZerosV))]>; +} -def : Pat<(v2i64 immAllZerosV), (V_SET0)>; -def : Pat<(v8i16 immAllZerosV), (V_SET0)>; -def : Pat<(v16i8 immAllZerosV), (V_SET0)>; -def : Pat<(v2f64 immAllZerosV), (V_SET0)>; -def : Pat<(v4f32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>; @@ -1935,6 +1940,7 @@ let Constraints = "$src1 = $dst" in { //===---------------------------------------------------------------------===// // SSE integer instructions +let ExeDomain = SSEPackedInt in { // Move Instructions let neverHasSideEffects = 1 in @@ -2043,6 +2049,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, } } // Constraints = "$src1 = $dst" +} // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic @@ -2105,7 +2112,8 @@ defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d, int_x86_sse2_psrai_d>; // 128-bit logical shifts. -let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { +let Constraints = "$src1 = $dst", neverHasSideEffects = 1, + ExeDomain = SSEPackedInt in { def PSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", []>; @@ -2139,7 +2147,7 @@ defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>; defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", ExeDomain = SSEPackedInt in { def PANDNrr : PDI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "pandn\t{$src2, $dst|$dst, $src2}", @@ -2193,6 +2201,8 @@ defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; +let ExeDomain = SSEPackedInt in { + // Shuffle and unpack instructions let AddedComplexity = 5 in { def PSHUFDri : PDIi8<0x70, MRMSrcReg, @@ -2369,10 +2379,13 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; +} // ExeDomain = SSEPackedInt + // Non-temporal stores def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; +let ExeDomain = SSEPackedInt in def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; @@ -2386,6 +2399,7 @@ def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 
+let ExeDomain = SSEPackedInt in def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; @@ -2414,7 +2428,7 @@ def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in // FIXME: Change encoding to pseudo. def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; @@ -3016,14 +3030,14 @@ let Predicates = [HasSSE2] in { let AddedComplexity = 15 in { // Zeroing a VR128 then do a MOVS{S|D} to the lower bits. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), + (MOVSSrr (v4f32 (V_SET0PS)), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss)))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), + (MOVSSrr (v4i32 (V_SET0PI)), (EXTRACT_SUBREG (v4i32 VR128:$src), x86_subreg_ss))>; } @@ -3181,9 +3195,6 @@ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), (SHUFFLE_get_shuf_imm VR128:$src3))>; // Set lowest element and zero upper elements. -let AddedComplexity = 15 in -def : Pat<(v2f64 (movl immAllZerosV_bc, VR128:$src)), - (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; @@ -3441,8 +3452,28 @@ let Constraints = "$src1 = $dst" in { OpSize; } } -defm PMULLD : SS41I_binop_patint<0x40, "pmulld", v4i32, mul, - int_x86_sse41_pmulld, 1>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. 
+let Constraints = "$src1 = $dst" in { +multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Commutable = 0> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>, + OpSize { + let isCommutable = Commutable; + } + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2))))]>, + OpSize; +} +} + +defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>; /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate let Constraints = "$src1 = $dst" in { @@ -3772,12 +3803,12 @@ def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "ptest \t{$src2, $src1|$src1, $src2}", - [(X86ptest VR128:$src1, VR128:$src2), - (implicit EFLAGS)]>, OpSize; + [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>, + OpSize; def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), "ptest \t{$src2, $src1|$src1, $src2}", - [(X86ptest VR128:$src1, (load addr:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>, + OpSize; } def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -3817,6 +3848,53 @@ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), (PCMPGTQrm VR128:$src1, addr:$src2)>; +// TODO: These should be AES as a feature set. +defm AESIMC : SS42I_binop_rm_int<0xDB, "aesimc", + int_x86_aesni_aesimc>; +defm AESENC : SS42I_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc>; +defm AESENCLAST : SS42I_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast>; +defm AESDEC : SS42I_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec>; +defm AESDECLAST : SS42I_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast>; + +def : Pat<(v2i64 (int_x86_aesni_aesimc VR128:$src1, VR128:$src2)), + (AESIMCrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesimc VR128:$src1, (memop addr:$src2))), + (AESIMCrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), + (AESENCrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), + (AESENCrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)), + (AESENCLASTrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), + (AESENCLASTrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), + (AESDECrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), + (AESDECrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), + (AESDECLASTrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), + (AESDECLASTrm VR128:$src1, addr:$src2)>; + +def AESKEYGENASSIST128rr : SS42AI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist 
VR128:$src1, imm:$src2))]>, + OpSize; +def AESKEYGENASSIST128rm : SS42AI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i32i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), + imm:$src2))]>, + OpSize; + // crc intrinsic instruction // This set of instructions are only rm, the only difference is the size // of r and m. diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index cd56816..8a0cde4 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -266,6 +266,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { unsigned Model = 0; DetectFamilyModel(EAX, Family, Model); IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13); + // If it's Nehalem, unaligned memory access is fast. + if (Family == 15 && Model == 26) + IsUAMemFast = true; GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); HasX86_64 = (EDX >> 29) & 0x1; @@ -286,6 +289,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , HasFMA3(false) , HasFMA4(false) , IsBTMemSlow(false) + , IsUAMemFast(false) , HasVectorUAMem(false) , DarwinVers(0) , stackAlignment(8) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 56220db..bf30154 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -78,6 +78,9 @@ protected: /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; + /// IsUAMemFast - True if unaligned memory access is fast. + bool IsUAMemFast; + /// HasVectorUAMem - True if SIMD operations can have unaligned memory /// operands. This may require setting a feature bit in the /// processor. @@ -148,6 +151,7 @@ public: bool hasFMA3() const { return HasFMA3; } bool hasFMA4() const { return HasFMA4; } bool isBTMemSlow() const { return IsBTMemSlow; } + bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } bool isTargetDarwin() const { return TargetType == isDarwin; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index f13e6f3..c608e56 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -17,6 +17,7 @@ #include "llvm/PassManager.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" @@ -169,6 +170,15 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, return true; // -print-machineinstr should print after this. 
} +bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) { + PM.add(createSSEDomainFixPass()); + return true; + } + return false; +} + bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 2bb5454..ae7b5b2 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -66,6 +66,7 @@ public: virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE); }; diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index e5f5a6d..54df33c 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -215,7 +215,15 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -326,6 +334,11 @@ XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode())) return 0; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 2e9a1e5..dd3cbc1 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -32,7 +32,7 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink, [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, SDNPVariadic]>; -def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTNone, +def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind, [SDNPHasChain, SDNPOptInFlag]>; def SDT_XCoreBR_JT : SDTypeProfile<0, 2, |
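
The XCoreInstrInfo.cpp hunks above teach AnalyzeBranch and RemoveBranch to step over DBG_VALUE instructions when scanning a block backwards from its end, so that building with debug info does not perturb branch analysis or branch removal. Below is a minimal sketch of that backward-scan idiom, written as a free helper for clarity; the helper name is invented here, but the iterator walk and the isDebugValue() check mirror the code in the patch.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Return an iterator to the last non-debug instruction in MBB, or MBB.end()
// if the block is empty or contains only DBG_VALUE pseudo-instructions.
static MachineBasicBlock::iterator lastRealInstruction(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin())
    return MBB.end();                 // empty block: nothing to analyze
  --I;
  while (I->isDebugValue()) {         // DBG_VALUE carries no machine semantics
    if (I == MBB.begin())
      return MBB.end();               // only debug values in this block
    --I;
  }
  return I;                           // last instruction that affects codegen
}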