| field | value | detail |
|---|---|---|
| author | rdivacky <rdivacky@FreeBSD.org> | 2009-10-14 17:57:32 +0000 |
| committer | rdivacky <rdivacky@FreeBSD.org> | 2009-10-14 17:57:32 +0000 |
| commit | cd749a9c07f1de2fb8affde90537efa4bc3e7c54 (patch) | |
| tree | b21f6de4e08b89bb7931806bab798fc2a5e3a686 | /lib/Target/ARM |
| parent | 72621d11de5b873f1695f391eb95f0b336c3d2d4 (diff) | |
| download | FreeBSD-src-cd749a9c07f1de2fb8affde90537efa4bc3e7c54.zip, FreeBSD-src-cd749a9c07f1de2fb8affde90537efa4bc3e7c54.tar.gz | |
Update llvm to r84119.
Diffstat (limited to 'lib/Target/ARM')
65 files changed, 20707 insertions, 7571 deletions
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 08dc07c..487ce1d 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -15,6 +15,7 @@ #ifndef TARGET_ARM_H #define TARGET_ARM_H +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" #include <cassert> @@ -24,7 +25,8 @@ class ARMBaseTargetMachine; class FunctionPass; class MachineCodeEmitter; class JITCodeEmitter; -class raw_ostream; +class ObjectCodeEmitter; +class formatted_raw_ostream; // Enums corresponding to ARM condition codes namespace ARMCC { @@ -50,7 +52,7 @@ namespace ARMCC { inline static CondCodes getOppositeCondition(CondCodes CC){ switch (CC) { - default: assert(0 && "Unknown condition code"); + default: llvm_unreachable("Unknown condition code"); case EQ: return NE; case NE: return EQ; case HS: return LO; @@ -71,7 +73,7 @@ namespace ARMCC { inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { switch (CC) { - default: assert(0 && "Unknown condition code"); + default: llvm_unreachable("Unknown condition code"); case ARMCC::EQ: return "eq"; case ARMCC::NE: return "ne"; case ARMCC::HS: return "hs"; @@ -90,20 +92,23 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { } } -FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM); -FunctionPass *createARMCodePrinterPass(raw_ostream &O, - ARMBaseTargetMachine &TM, - bool Verbose); -FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM, - MachineCodeEmitter &MCE); +FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel); FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM, MachineCodeEmitter &MCE); FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, JITCodeEmitter &JCE); +FunctionPass *createARMObjectCodeEmitterPass(ARMBaseTargetMachine &TM, + ObjectCodeEmitter &OCE); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMConstantIslandPass(); +FunctionPass *createNEONPreAllocPass(); +FunctionPass *createThumb2ITBlockPass(); +FunctionPass *createThumb2SizeReductionPass(); + +extern Target TheARMTarget, TheThumbTarget; } // end namespace llvm; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 9001e50..8851fbb 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -89,27 +89,20 @@ def : ProcNoItin<"xscale", [ArchV5TE]>; def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. -def : Processor<"arm1136j-s", V6Itineraries, - [ArchV6]>; -def : Processor<"arm1136jf-s", V6Itineraries, - [ArchV6, FeatureVFP2]>; -def : Processor<"arm1176jz-s", V6Itineraries, - [ArchV6]>; -def : Processor<"arm1176jzf-s", V6Itineraries, - [ArchV6, FeatureVFP2]>; -def : Processor<"mpcorenovfp", V6Itineraries, - [ArchV6]>; -def : Processor<"mpcore", V6Itineraries, - [ArchV6, FeatureVFP2]>; +def : ProcNoItin<"arm1136j-s", [ArchV6]>; +def : ProcNoItin<"arm1136jf-s", [ArchV6, FeatureVFP2]>; +def : ProcNoItin<"arm1176jz-s", [ArchV6]>; +def : ProcNoItin<"arm1176jzf-s", [ArchV6, FeatureVFP2]>; +def : ProcNoItin<"mpcorenovfp", [ArchV6]>; +def : ProcNoItin<"mpcore", [ArchV6, FeatureVFP2]>; // V6T2 Processors. -def : Processor<"arm1156t2-s", V6Itineraries, - [ArchV6T2, FeatureThumb2]>; -def : Processor<"arm1156t2f-s", V6Itineraries, - [ArchV6T2, FeatureThumb2, FeatureVFP2]>; +def : ProcNoItin<"arm1156t2-s", [ArchV6T2, FeatureThumb2]>; +def : ProcNoItin<"arm1156t2f-s", [ArchV6T2, FeatureThumb2, FeatureVFP2]>; // V7 Processors. 
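The ARM.h hunk above swaps `assert(0 && ...)` for `llvm_unreachable(...)` in the `default:` cases. Besides nicer diagnostics, `llvm_unreachable` is declared noreturn, so the compiler knows control cannot fall out of the switch even with assertions compiled out. A minimal standalone sketch of the same pattern (the enum values and the `unreachable` helper here are illustrative stand-ins, not the real ARMCC encoding or the LLVM macro):

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative stand-in for llvm_unreachable: report and abort. Marked
// noreturn so release builds don't warn about a missing return below.
[[noreturn]] static void unreachable(const char *Msg) {
  std::fprintf(stderr, "UNREACHABLE: %s\n", Msg);
  std::abort();
}

// Subset of the ARM condition codes handled in the hunk above.
enum CondCodes { EQ, NE, HS, LO };

// Mirrors the shape of ARMCC::getOppositeCondition: every condition maps
// to the condition that holds exactly when it fails.
static CondCodes getOppositeCondition(CondCodes CC) {
  switch (CC) {
  default: unreachable("Unknown condition code");
  case EQ: return NE;  // equal       <-> not equal
  case NE: return EQ;
  case HS: return LO;  // unsigned >= <-> unsigned <
  case LO: return HS;
  }
}

int main() {
  std::printf("opposite(EQ) == NE: %d\n", getOppositeCondition(EQ) == NE);
  return 0;
}
```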
-def : ProcNoItin<"cortex-a8", [ArchV7A, FeatureThumb2, FeatureNEON]>; +def : Processor<"cortex-a8", CortexA8Itineraries, + [ArchV7A, FeatureThumb2, FeatureNEON]>; def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>; //===----------------------------------------------------------------------===// @@ -131,13 +124,13 @@ def ARMInstrInfo : InstrInfo { let TSFlagsFields = ["AddrModeBits", "SizeFlag", "IndexModeBits", - "isUnaryDataProc", - "Form"]; + "Form", + "isUnaryDataProc"]; let TSFlagsShifts = [0, 4, 7, 9, - 10]; + 15]; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h index 15c9ec1..1839153 100644 --- a/lib/Target/ARM/ARMAddressingModes.h +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -15,11 +15,12 @@ #define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include <cassert> namespace llvm { - + /// ARM_AM - ARM Addressing Mode Stuff namespace ARM_AM { enum ShiftOpc { @@ -30,14 +31,14 @@ namespace ARM_AM { ror, rrx }; - + enum AddrOpc { add = '+', sub = '-' }; - + static inline const char *getShiftOpcStr(ShiftOpc Op) { switch (Op) { - default: assert(0 && "Unknown shift opc!"); + default: llvm_unreachable("Unknown shift opc!"); case ARM_AM::asr: return "asr"; case ARM_AM::lsl: return "lsl"; case ARM_AM::lsr: return "lsr"; @@ -45,7 +46,7 @@ namespace ARM_AM { case ARM_AM::rrx: return "rrx"; } } - + static inline ShiftOpc getShiftOpcForNode(SDValue N) { switch (N.getOpcode()) { default: return ARM_AM::no_shift; @@ -70,7 +71,7 @@ namespace ARM_AM { static inline const char *getAMSubModeStr(AMSubMode Mode) { switch (Mode) { - default: assert(0 && "Unknown addressing sub-mode!"); + default: llvm_unreachable("Unknown addressing sub-mode!"); case ARM_AM::ia: return "ia"; case ARM_AM::ib: return "ib"; case ARM_AM::da: return "da"; @@ -80,7 +81,7 @@ namespace ARM_AM { static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) { switch (Mode) { - default: assert(0 && "Unknown addressing sub-mode!"); + default: llvm_unreachable("Unknown addressing sub-mode!"); case ARM_AM::ia: return isLD ? "fd" : "ea"; case ARM_AM::ib: return isLD ? "ed" : "fa"; case ARM_AM::da: return isLD ? "fa" : "ed"; @@ -94,14 +95,14 @@ namespace ARM_AM { assert(Amt < 32 && "Invalid rotate amount"); return (Val >> Amt) | (Val << ((32-Amt)&31)); } - + /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits. /// static inline unsigned rotl32(unsigned Val, unsigned Amt) { assert(Amt < 32 && "Invalid rotate amount"); return (Val << Amt) | (Val >> ((32-Amt)&31)); } - + //===--------------------------------------------------------------------===// // Addressing Mode #1: shift_operand with registers //===--------------------------------------------------------------------===// @@ -136,7 +137,7 @@ namespace ARM_AM { static inline unsigned getSOImmValRot(unsigned Imm) { return (Imm >> 8) * 2; } - + /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, /// computing the rotate amount to use. If this immediate value cannot be /// handled with a single shifter-op, determine a good rotate amount that will @@ -145,14 +146,14 @@ namespace ARM_AM { // 8-bit (or less) immediates are trivially shifter_operands with a rotate // of zero. if ((Imm & ~255U) == 0) return 0; - + // Use CTZ to compute the rotate amount. 
unsigned TZ = CountTrailingZeros_32(Imm); - + // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, // not 9. unsigned RotAmt = TZ & ~1; - + // If we can handle this spread, return it. if ((rotr32(Imm, RotAmt) & ~255U) == 0) return (32-RotAmt)&31; // HW rotates right, not left. @@ -165,16 +166,16 @@ namespace ARM_AM { // Restart the search for a high-order bit after the initial seconds of // ones. unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1)); - + // Rotate amount must be even. unsigned RotAmt2 = TZ2 & ~1; - + // If this fits, use it. if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0) return (32-RotAmt2)&31; // HW rotates right, not left. } } - + // Otherwise, we have no way to cover this span of bits with a single // shifter_op immediate. Return a chunk of bits that will be useful to // handle. @@ -188,17 +189,17 @@ namespace ARM_AM { // 8-bit (or less) immediates are trivially shifter_operands with a rotate // of zero. if ((Arg & ~255U) == 0) return Arg; - + unsigned RotAmt = getSOImmValRotate(Arg); // If this cannot be handled with a single shifter_op, bail out. if (rotr32(~255U, RotAmt) & Arg) return -1; - + // Encode this correctly. return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); } - + /// isSOImmTwoPartVal - Return true if the specified value can be obtained by /// or'ing together two SOImmVal's. static inline bool isSOImmTwoPartVal(unsigned V) { @@ -206,12 +207,12 @@ namespace ARM_AM { V = rotr32(~255U, getSOImmValRotate(V)) & V; if (V == 0) return false; - + // If this can be handled with two shifter_op's, accept. V = rotr32(~255U, getSOImmValRotate(V)) & V; return V == 0; } - + /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, /// return the first chunk of it. static inline unsigned getSOImmTwoPartFirst(unsigned V) { @@ -221,14 +222,14 @@ namespace ARM_AM { /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal, /// return the second chunk of it. static inline unsigned getSOImmTwoPartSecond(unsigned V) { - // Mask out the first hunk. + // Mask out the first hunk. V = rotr32(~255U, getSOImmValRotate(V)) & V; - + // Take what's left. assert(V == (rotr32(255U, getSOImmValRotate(V)) & V)); return V; } - + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed /// by a left shift. Returns the shift amount to use. static inline unsigned getThumbImmValShift(unsigned Imm) { @@ -243,7 +244,7 @@ namespace ARM_AM { /// isThumbImmShiftedVal - Return true if the specified value can be obtained /// by left shifting a 8-bit immediate. static inline bool isThumbImmShiftedVal(unsigned V) { - // If this can be handled with + // If this can be handled with V = (~255U << getThumbImmValShift(V)) & V; return V == 0; } @@ -259,10 +260,10 @@ namespace ARM_AM { return CountTrailingZeros_32(Imm); } - /// isThumbImm16ShiftedVal - Return true if the specified value can be + /// isThumbImm16ShiftedVal - Return true if the specified value can be /// obtained by left shifting a 16-bit immediate. static inline bool isThumbImm16ShiftedVal(unsigned V) { - // If this can be handled with + // If this can be handled with V = (~65535U << getThumbImm16ValShift(V)) & V; return V == 0; } @@ -273,28 +274,6 @@ namespace ARM_AM { return V >> getThumbImmValShift(V); } - /// getT2SOImmValDecode - Given a 12-bit encoded Thumb-2 modified immediate, - /// return the corresponding 32-bit immediate value. - /// See ARM Reference Manual A6.3.2. 
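To make the shifter-operand encoding concrete: an ARM `so_imm` is an 8-bit value rotated right by an even amount, and `getSOImmVal` above packs the 8-bit chunk into bits 0-7 and half the rotate amount into bits 8-11. A standalone round trip for the value 0x0000AB00 (`decodeSOImm` is my name for illustration, not an LLVM API; the rotate helpers are copied from the hunk above):

```cpp
#include <cassert>
#include <cstdio>

static unsigned rotr32(unsigned Val, unsigned Amt) {
  return (Val >> Amt) | (Val << ((32 - Amt) & 31));
}
static unsigned rotl32(unsigned Val, unsigned Amt) {
  return (Val << Amt) | (Val >> ((32 - Amt) & 31));
}

// Decode: rotate the low 8 bits right by twice the rotate field (bits
// 8-11), exactly as the hardware interprets the 12-bit shifter operand.
static unsigned decodeSOImm(unsigned Enc) {
  return rotr32(Enc & 0xFF, ((Enc >> 8) & 0xF) * 2);
}

int main() {
  // Worked example: 0x0000AB00 has 8 trailing zeros, so the chunk 0xAB
  // must be rotated right by 24 (left by 8). Encoding:
  //   8-bit chunk = 0xAB, rotate field = 24/2 = 12  ->  0xCAB.
  unsigned Enc = rotl32(0x0000AB00u, 24) | ((24 >> 1) << 8);
  std::printf("encoding = %03x\n", Enc);     // prints cab
  assert(decodeSOImm(Enc) == 0x0000AB00u);   // round-trips
  return 0;
}
```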
- static inline unsigned getT2SOImmValDecode(unsigned Imm) { - unsigned Base = Imm & 0xff; - switch ((Imm >> 8) & 0xf) { - case 0: - return Base; - case 1: - return Base | (Base << 16); - case 2: - return (Base << 8) | (Base << 24); - case 3: - return Base | (Base << 8) | (Base << 16) | (Base << 24); - default: - break; - } - - // shifted immediate - unsigned RotAmount = ((Imm >> 7) & 0x1f) - 8; - return (Base | 0x80) << (24 - RotAmount); - } /// getT2SOImmValSplat - Return the 12-bit encoded representation /// if the specified value can be obtained by splatting the low 8 bits @@ -305,12 +284,12 @@ namespace ARM_AM { /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3 /// Return -1 if none of the above apply. /// See ARM Reference Manual A6.3.2. - static inline int getT2SOImmValSplat(unsigned V) { + static inline int getT2SOImmValSplatVal(unsigned V) { unsigned u, Vs, Imm; // control = 0 - if ((V & 0xffffff00) == 0) + if ((V & 0xffffff00) == 0) return V; - + // If the value is zeroes in the first byte, just shift those off Vs = ((V & 0xff) == 0) ? V >> 8 : V; // Any passing value only has 8 bits of payload, splatted across the word @@ -329,11 +308,11 @@ namespace ARM_AM { return -1; } - /// getT2SOImmValRotate - Return the 12-bit encoded representation if the + /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the /// specified value is a rotated 8-bit value. Return -1 if no rotation /// encoding is possible. /// See ARM Reference Manual A6.3.2. - static inline int getT2SOImmValRotate (unsigned V) { + static inline int getT2SOImmValRotateVal(unsigned V) { unsigned RotAmt = CountLeadingZeros_32(V); if (RotAmt >= 24) return -1; @@ -346,23 +325,23 @@ namespace ARM_AM { } /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit - /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit + /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit /// encoding for it. If not, return -1. /// See ARM Reference Manual A6.3.2. static inline int getT2SOImmVal(unsigned Arg) { // If 'Arg' is an 8-bit splat, then get the encoded value. - int Splat = getT2SOImmValSplat(Arg); + int Splat = getT2SOImmValSplatVal(Arg); if (Splat != -1) return Splat; - + // If 'Arg' can be handled with a single shifter_op return the value. - int Rot = getT2SOImmValRotate(Arg); + int Rot = getT2SOImmValRotateVal(Arg); if (Rot != -1) return Rot; return -1; } - + //===--------------------------------------------------------------------===// // Addressing Mode #2 @@ -380,7 +359,7 @@ namespace ARM_AM { // If this addressing mode is a frame index (before prolog/epilog insertion // and code rewriting), this operand will have the form: FI#, reg0, <offs> // with no shift amount for the frame offset. - // + // static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) { assert(Imm12 < (1 << 12) && "Imm too large!"); bool isSub = Opc == sub; @@ -395,8 +374,8 @@ namespace ARM_AM { static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { return (ShiftOpc)(AM2Opc >> 13); } - - + + //===--------------------------------------------------------------------===// // Addressing Mode #3 //===--------------------------------------------------------------------===// @@ -409,7 +388,7 @@ namespace ARM_AM { // The first operand is always a Reg. The second operand is a reg if in // reg/reg form, otherwise it's reg#0. The third field encodes the operation // in bit 8, the immediate in bits 0-7. - + /// getAM3Opc - This function encodes the addrmode3 opc field. 
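The `getT2SOImmValSplatVal` rename above (and the removal of `getT2SOImmValDecode`) concerns Thumb-2 "modified immediate" constants, where an 8-bit pattern `abcdefgh` is replicated across the word under a 2-bit control field (ARM ARM A6.3.2). A hedged sketch of the decode direction, mirroring the deleted function's splat cases; `decodeT2Splat` is my name for illustration:

```cpp
#include <cassert>

// Replicate an 8-bit pattern per the Thumb-2 splat control field.
static unsigned decodeT2Splat(unsigned Pattern, unsigned Control) {
  Pattern &= 0xFF;
  switch (Control) {
  case 0: return Pattern;                           // 000000xy
  case 1: return Pattern | (Pattern << 16);         // 00xy00xy
  case 2: return (Pattern << 8) | (Pattern << 24);  // xy00xy00
  case 3: return Pattern | (Pattern << 8) |
                 (Pattern << 16) | (Pattern << 24); // xyxyxyxy
  }
  return 0;
}

int main() {
  assert(decodeT2Splat(0xAB, 1) == 0x00AB00AB);
  assert(decodeT2Splat(0xAB, 2) == 0xAB00AB00);
  assert(decodeT2Splat(0xAB, 3) == 0xABABABAB);
  return 0;
}
```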
static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) { bool isSub = Opc == sub; @@ -421,7 +400,7 @@ namespace ARM_AM { static inline AddrOpc getAM3Op(unsigned AM3Opc) { return ((AM3Opc >> 8) & 1) ? sub : add; } - + //===--------------------------------------------------------------------===// // Addressing Mode #4 //===--------------------------------------------------------------------===// @@ -469,7 +448,7 @@ namespace ARM_AM { // // IA - Increment after // DB - Decrement before - + /// getAM5Opc - This function encodes the addrmode5 opc field. static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { bool isSub = Opc == sub; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp new file mode 100644 index 0000000..ecdf5a0 --- /dev/null +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -0,0 +1,1060 @@ +//===- ARMBaseInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMBaseInstrInfo.h" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMGenInstrInfo.inc" +#include "ARMMachineFunctionInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +static cl::opt<bool> +EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, + cl::desc("Enable ARM 2-addr to 3-addr conv")); + +ARMBaseInstrInfo::ARMBaseInstrInfo() + : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)) { +} + +MachineInstr * +ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + // FIXME: Thumb2 support. + + if (!EnableARM3Addr) + return NULL; + + MachineInstr *MI = MBBI; + MachineFunction &MF = *MI->getParent()->getParent(); + unsigned TSFlags = MI->getDesc().TSFlags; + bool isPre = false; + switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { + default: return NULL; + case ARMII::IndexModePre: + isPre = true; + break; + case ARMII::IndexModePost: + break; + } + + // Try splitting an indexed load/store to an un-indexed one plus an add/sub + // operation. + unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); + if (MemOpc == 0) + return NULL; + + MachineInstr *UpdateMI = NULL; + MachineInstr *MemMI = NULL; + unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); + const TargetInstrDesc &TID = MI->getDesc(); + unsigned NumOps = TID.getNumOperands(); + bool isLoad = !TID.mayStore(); + const MachineOperand &WB = isLoad ? 
MI->getOperand(1) : MI->getOperand(0); + const MachineOperand &Base = MI->getOperand(2); + const MachineOperand &Offset = MI->getOperand(NumOps-3); + unsigned WBReg = WB.getReg(); + unsigned BaseReg = Base.getReg(); + unsigned OffReg = Offset.getReg(); + unsigned OffImm = MI->getOperand(NumOps-2).getImm(); + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm(); + switch (AddrMode) { + default: + assert(false && "Unknown indexed op!"); + return NULL; + case ARMII::AddrMode2: { + bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM2Offset(OffImm); + if (OffReg == 0) { + if (ARM_AM::getSOImmVal(Amt) == -1) + // Can't encode it in a so_imm operand. This transformation will + // add more than 1 instruction. Abandon! + return NULL; + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + } else if (Amt != 0) { + ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm); + unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt); + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg) + .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc) + .addImm(Pred).addReg(0).addReg(0); + } else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + case ARMII::AddrMode3 : { + bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM3Offset(OffImm); + if (OffReg == 0) + // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand. + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + } + + std::vector<MachineInstr*> NewMIs; + if (isPre) { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + NewMIs.push_back(MemMI); + NewMIs.push_back(UpdateMI); + } else { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + if (WB.isDead()) + UpdateMI->getOperand(0).setIsDead(); + NewMIs.push_back(UpdateMI); + NewMIs.push_back(MemMI); + } + + // Transfer LiveVariables states, kill / dead info. + if (LV) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + unsigned Reg = MO.getReg(); + + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + if (MO.isDef()) { + MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI; + if (MO.isDead()) + LV->addVirtualRegisterDead(Reg, NewMI); + } + if (MO.isUse() && MO.isKill()) { + for (unsigned j = 0; j < 2; ++j) { + // Look at the two new MI's in reverse order. 
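`convertToThreeAddress` above unpacks the addrmode2 descriptor with `getAM2Op`/`getAM2Offset`/`getAM2ShiftOpc`. From the accessors shown in the ARMAddressingModes.h hunk, the layout appears to be: imm12 in bits 0-11, the add/sub flag in bit 12, and the shift opcode from bit 13 up. A standalone pack/unpack sketch under that assumption (the enums here are illustrative stand-ins for `ARM_AM::AddrOpc`/`ShiftOpc`):

```cpp
#include <cassert>

enum AddrOpc  { add, sub };                               // illustrative
enum ShiftOpc { no_shift, asr, lsl, lsr, ror, rrx };      // illustrative

static unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
  assert(Imm12 < (1u << 12) && "Imm too large!");
  bool isSub = Opc == sub;
  return Imm12 | ((unsigned)isSub << 12) | ((unsigned)SO << 13);
}

static unsigned getAM2Offset(unsigned AM2Opc)   { return AM2Opc & ((1u << 12) - 1); }
static AddrOpc  getAM2Op(unsigned AM2Opc)       { return ((AM2Opc >> 12) & 1) ? sub : add; }
static ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { return (ShiftOpc)(AM2Opc >> 13); }

int main() {
  unsigned Enc = getAM2Opc(sub, 20, lsl);  // subtracting form, lsl, amount 20
  assert(getAM2Offset(Enc) == 20);
  assert(getAM2Op(Enc) == sub);
  assert(getAM2ShiftOpc(Enc) == lsl);
  return 0;
}
```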
+ MachineInstr *NewMI = NewMIs[j]; + if (!NewMI->readsRegister(Reg)) + continue; + LV->addVirtualRegisterKilled(Reg, NewMI); + if (VI.removeKill(MI)) + VI.Kills.push_back(NewMI); + break; + } + } + } + } + } + + MFI->insert(MBBI, NewMIs[1]); + MFI->insert(MBBI, NewMIs[0]); + return NewMIs[0]; +} + +// Branch analysis. +bool +ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(LastInst->getOperand(1)); + Cond.push_back(LastInst->getOperand(2)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a B and a Bcc, handle it. + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(SecondLastInst->getOperand(1)); + Cond.push_back(SecondLastInst->getOperand(2)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // ...likewise if it ends with a branch table followed by an unconditional + // branch. The branch folder can create these, and we must get rid of them for + // correctness of Thumb constant islands. + if (isJumpTableBranchOpcode(SecondLastOpc) && + isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + + // Otherwise, can't handle this. + return true; +} + + +unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. 
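`AnalyzeBranch` above implements the usual `TargetInstrInfo` contract: it returns false when it can classify the block's terminators, filling `TBB`, `FBB`, and `Cond`; it returns true when it cannot (for example the jump-table case, which it refuses even after deleting a dead trailing branch). A hedged fragment of how a client such as the branch folder consumes the result; this is a sketch assuming an LLVM tree of this vintage, not compilable on its own:

```cpp
// TII points at the target's ARMBaseInstrInfo; MBB is the block to inspect.
MachineBasicBlock *TBB = 0, *FBB = 0;
SmallVector<MachineOperand, 4> Cond;
if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
  if (!TBB && Cond.empty()) {
    // No terminator: MBB falls through to its layout successor.
  } else if (TBB && Cond.empty()) {
    // Unconditional branch to TBB.
  } else if (TBB && !FBB) {
    // Conditional branch to TBB (Cond = {ARMCC imm, CPSR reg}),
    // otherwise fall through.
  } else {
    // Two-way: conditional branch to TBB, unconditional branch to FBB.
  }
} else {
  // AnalyzeBranch could not classify the block (e.g. indirect branch);
  // leave its terminators alone.
}
```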
+ I->eraseFromParent(); + return 2; +} + +unsigned +ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond) const { + // FIXME this should probably have a DebugLoc argument + DebugLoc dl = DebugLoc::getUnknownLoc(); + + ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); + int BOpc = !AFI->isThumbFunction() + ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); + int BccOpc = !AFI->isThumbFunction() + ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc); + + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "ARM branch conditions have two components!"); + + if (FBB == 0) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB); + else + BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); + return 1; + } + + // Two-way conditional branch. + BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); + BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB); + return 2; +} + +bool ARMBaseInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); + Cond[0].setImm(ARMCC::getOppositeCondition(CC)); + return false; +} + +bool ARMBaseInstrInfo:: +PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + unsigned Opc = MI->getOpcode(); + if (isUncondBranchOpcode(Opc)) { + MI->setDesc(get(getMatchingCondBranchOpcode(Opc))); + MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm())); + MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false)); + return true; + } + + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setImm(Pred[0].getImm()); + MI->getOperand(PIdx+1).setReg(Pred[1].getReg()); + return true; + } + return false; +} + +bool ARMBaseInstrInfo:: +SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const { + if (Pred1.size() > 2 || Pred2.size() > 2) + return false; + + ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm(); + ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm(); + if (CC1 == CC2) + return true; + + switch (CC1) { + default: + return false; + case ARMCC::AL: + return true; + case ARMCC::HS: + return CC2 == ARMCC::HI; + case ARMCC::LS: + return CC2 == ARMCC::LO || CC2 == ARMCC::EQ; + case ARMCC::GE: + return CC2 == ARMCC::GT; + case ARMCC::LE: + return CC2 == ARMCC::LT; + } +} + +bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // FIXME: This confuses implicit_def with optional CPSR def. 
+ const TargetInstrDesc &TID = MI->getDesc(); + if (!TID.getImplicitDefs() && !TID.hasOptionalDef()) + return false; + + bool Found = false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() == ARM::CPSR) { + Pred.push_back(MO); + Found = true; + } + } + + return Found; +} + + +/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI) DISABLE_INLINE; +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI) { + return JT[JTI].MBBs.size(); +} + +/// GetInstSize - Return the size of the specified MachineInstr. +/// +unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + // Basic size info comes from the TSFlags field. + const TargetInstrDesc &TID = MI->getDesc(); + unsigned TSFlags = TID.TSFlags; + + unsigned Opc = MI->getOpcode(); + switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { + default: { + // If this machine instr is an inline asm, measure it. + if (MI->getOpcode() == ARM::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI->isLabel()) + return 0; + switch (Opc) { + default: + llvm_unreachable("Unknown or unset size field for instr!"); + case TargetInstrInfo::IMPLICIT_DEF: + case TargetInstrInfo::KILL: + case TargetInstrInfo::DBG_LABEL: + case TargetInstrInfo::EH_LABEL: + return 0; + } + break; + } + case ARMII::Size8Bytes: return 8; // ARM instruction x 2. + case ARMII::Size4Bytes: return 4; // ARM / Thumb2 instruction. + case ARMII::Size2Bytes: return 2; // Thumb1 instruction. + case ARMII::SizeSpecial: { + switch (Opc) { + case ARM::CONSTPOOL_ENTRY: + // If this machine instr is a constant pool entry, its size is recorded as + // operand #2. + return MI->getOperand(2).getImm(); + case ARM::Int_eh_sjlj_setjmp: + return 24; + case ARM::t2Int_eh_sjlj_setjmp: + return 20; + case ARM::BR_JTr: + case ARM::BR_JTm: + case ARM::BR_JTadd: + case ARM::tBR_JTr: + case ARM::t2BR_JT: + case ARM::t2TBB: + case ARM::t2TBH: { + // These are jumptable branches, i.e. a branch followed by an inlined + // jumptable. The size is 4 + 4 * number of entries. For TBB, each + // entry is one byte; TBH two byte each. + unsigned EntrySize = (Opc == ARM::t2TBB) + ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4); + unsigned NumOps = TID.getNumOperands(); + MachineOperand JTOP = + MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2)); + unsigned JTI = JTOP.getIndex(); + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + assert(JTI < JT.size()); + // Thumb instructions are 2 byte aligned, but JT entries are 4 byte + // 4 aligned. The assembler / linker may add 2 byte padding just before + // the JT entries. The size does not include this padding; the + // constant islands pass does separate bookkeeping for it. + // FIXME: If we know the size of the function is less than (1 << 16) *2 + // bytes, we can use 16-bit entries instead. Then there won't be an + // alignment issue. + unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 
2 : 4; + unsigned NumEntries = getNumJTEntries(JT, JTI); + if (Opc == ARM::t2TBB && (NumEntries & 1)) + // Make sure the instruction that follows TBB is 2-byte aligned. + // FIXME: Constant island pass should insert an "ALIGN" instruction + // instead. + ++NumEntries; + return NumEntries * EntrySize + InstSize; + } + default: + // Otherwise, pseudo-instruction sizes are zero. + return 0; + } + } + } + return 0; // Not reached +} + +/// Return true if the instruction is a register to register move and +/// leave the source and dest operands in the passed parameters. +/// +bool +ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned& SrcSubIdx, unsigned& DstSubIdx) const { + SrcSubIdx = DstSubIdx = 0; // No sub-registers. + + switch (MI.getOpcode()) { + default: break; + case ARM::FCPYS: + case ARM::FCPYD: + case ARM::VMOVD: + case ARM::VMOVQ: { + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + return true; + } + case ARM::MOVr: + case ARM::tMOVr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2gpr: + case ARM::t2MOVr: { + assert(MI.getDesc().getNumOperands() >= 2 && + MI.getOperand(0).isReg() && + MI.getOperand(1).isReg() && + "Invalid ARM MOV instruction"); + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + return true; + } + } + + return false; +} + +unsigned +ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::LDR: + case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame. + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::t2LDRi12: + case ARM::tRestore: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::FLDD: + case ARM::FLDS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned +ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::STR: + case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. 
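The jump-table arm of `GetInstSizeInBytes` above reduces to `NumEntries * EntrySize + InstSize`: one byte per TBB entry, two per TBH, four otherwise, a 2-byte branch for tBR_JTr/t2BR_JT, and one padding entry when a TBB table has an odd count. A standalone sketch of just that arithmetic (the opcode enum is an illustrative subset):

```cpp
#include <cstdio>

enum JTOpcode { BR_JTr, tBR_JTr, t2BR_JT, t2TBB, t2TBH }; // illustrative

// Mirrors the size computation in GetInstSizeInBytes above.
static unsigned jumpTableSize(JTOpcode Opc, unsigned NumEntries) {
  unsigned EntrySize = (Opc == t2TBB) ? 1 : ((Opc == t2TBH) ? 2 : 4);
  unsigned InstSize  = (Opc == tBR_JTr || Opc == t2BR_JT) ? 2 : 4;
  if (Opc == t2TBB && (NumEntries & 1))
    ++NumEntries;  // keep the instruction after the table 2-byte aligned
  return NumEntries * EntrySize + InstSize;
}

int main() {
  std::printf("%u\n", jumpTableSize(BR_JTr, 5));  // 5*4 + 4 = 24
  std::printf("%u\n", jumpTableSize(t2BR_JT, 5)); // 5*4 + 2 = 22
  std::printf("%u\n", jumpTableSize(t2TBB, 5));   // padded to 6: 6*1 + 4 = 10
  return 0;
}
```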
+ if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::t2STRi12: + case ARM::tSpill: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::FSTD: + case ARM::FSTS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +bool +ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC) const { + DebugLoc DL = DebugLoc::getUnknownLoc(); + if (I != MBB.end()) DL = I->getDebugLoc(); + + if (DestRC != SrcRC) { + // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies + // Allow QPR / QPR_VFP2 cross-class copies + if (DestRC == ARM::DPRRegisterClass) { + if (SrcRC == ARM::DPR_VFP2RegisterClass || + SrcRC == ARM::DPR_8RegisterClass) { + } else + return false; + } else if (DestRC == ARM::DPR_VFP2RegisterClass) { + if (SrcRC == ARM::DPRRegisterClass || + SrcRC == ARM::DPR_8RegisterClass) { + } else + return false; + } else if (DestRC == ARM::DPR_8RegisterClass) { + if (SrcRC == ARM::DPRRegisterClass || + SrcRC == ARM::DPR_VFP2RegisterClass) { + } else + return false; + } else if ((DestRC == ARM::QPRRegisterClass && + SrcRC == ARM::QPR_VFP2RegisterClass) || + (DestRC == ARM::QPR_VFP2RegisterClass && + SrcRC == ARM::QPRRegisterClass)) { + } else + return false; + } + + if (DestRC == ARM::GPRRegisterClass) { + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), + DestReg).addReg(SrcReg))); + } else if (DestRC == ARM::SPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg) + .addReg(SrcReg)); + } else if ((DestRC == ARM::DPRRegisterClass) || + (DestRC == ARM::DPR_VFP2RegisterClass) || + (DestRC == ARM::DPR_8RegisterClass)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg) + .addReg(SrcReg)); + } else if (DestRC == ARM::QPRRegisterClass || + DestRC == ARM::QPR_VFP2RegisterClass) { + BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg); + } else { + return false; + } + + return true; +} + +void ARMBaseInstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC) const { + DebugLoc DL = DebugLoc::getUnknownLoc(); + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOStore, 0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + + if (RC == ARM::GPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::DPRRegisterClass || + RC == ARM::DPR_VFP2RegisterClass || + RC == ARM::DPR_8RegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if 
(RC == ARM::SPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else { + assert((RC == ARM::QPRRegisterClass || + RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!"); + // FIXME: Neon instructions should support predicates + BuildMI(MBB, I, DL, get(ARM::VSTRQ)).addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO); + } +} + +void ARMBaseInstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC) const { + DebugLoc DL = DebugLoc::getUnknownLoc(); + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + MachineMemOperand::MOLoad, 0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + + if (RC == ARM::GPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) + .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::DPRRegisterClass || + RC == ARM::DPR_VFP2RegisterClass || + RC == ARM::DPR_8RegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (RC == ARM::SPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else { + assert((RC == ARM::QPRRegisterClass || + RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!"); + // FIXME: Neon instructions should support predicates + BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg).addFrameIndex(FI).addImm(0).addMemOperand(MMO); + } +} + +MachineInstr *ARMBaseInstrInfo:: +foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, int FI) const { + if (Ops.size() != 1) return NULL; + + unsigned OpNum = Ops[0]; + unsigned Opc = MI->getOpcode(); + MachineInstr *NewMI = NULL; + if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { + // If it is updating CPSR, then it cannot be folded. 
+ if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead()) + return NULL; + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + if (Opc == ARM::MOVr) + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR)) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); + else // ARM::t2MOVr + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + if (Opc == ARM::MOVr) + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef)) + .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); + else // ARM::t2MOVr + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } + } else if (Opc == ARM::tMOVgpr2gpr || + Opc == ARM::tMOVtgpr2gpr || + Opc == ARM::tMOVgpr2tgpr) { + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); + } + } else if (Opc == ARM::FCPYS) { + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS)) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addFrameIndex(FI) + .addImm(0).addImm(Pred).addReg(PredReg); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } + } + else if (Opc == ARM::FCPYD) { + unsigned Pred = MI->getOperand(2).getImm(); + unsigned PredReg = MI->getOperand(3).getReg(); + if (OpNum == 0) { // move -> store + unsigned SrcReg = MI->getOperand(1).getReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD)) + 
.addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } else { // move -> load + unsigned DstReg = MI->getOperand(0).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(isDead) | + getUndefRegState(isUndef)) + .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); + } + } + + return NewMI; +} + +MachineInstr* +ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr* LoadMI) const { + // FIXME + return 0; +} + +bool +ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + if (Ops.size() != 1) return false; + + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { + // If it is updating CPSR, then it cannot be folded. + return MI->getOperand(4).getReg() != ARM::CPSR || + MI->getOperand(4).isDead(); + } else if (Opc == ARM::tMOVgpr2gpr || + Opc == ARM::tMOVtgpr2gpr || + Opc == ARM::tMOVgpr2tgpr) { + return true; + } else if (Opc == ARM::FCPYS || Opc == ARM::FCPYD) { + return true; + } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVQ) { + return false; // FIXME + } + + return false; +} + +/// getInstrPredicate - If instruction is predicated, returns its predicate +/// condition, otherwise returns AL. It also returns the condition code +/// register by reference. +ARMCC::CondCodes +llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx == -1) { + PredReg = 0; + return ARMCC::AL; + } + + PredReg = MI->getOperand(PIdx+1).getReg(); + return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm(); +} + + +int llvm::getMatchingCondBranchOpcode(int Opc) { + if (Opc == ARM::B) + return ARM::Bcc; + else if (Opc == ARM::tB) + return ARM::tBcc; + else if (Opc == ARM::t2B) + return ARM::t2Bcc; + + llvm_unreachable("Unknown unconditional branch opcode!"); + return 0; +} + + +void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + while (NumBytes) { + unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes); + unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt); + assert(ThisVal && "Didn't extract field correctly"); + + // We will handle these bits from offset, clear them. + NumBytes &= ~ThisVal; + + assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?"); + + // Build the new ADD / SUB. + unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri; + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill).addImm(ThisVal) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + BaseReg = DestReg; + } +} + +bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrMode2. 
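`rewriteARMFrameIndex`, which continues in the next chunk, folds as much of the frame offset into the instruction's immediate field as the addressing mode allows and returns the remainder through `Offset`. For AddrMode2 the field is 12 bits with scale 1, so an offset of 5000 folds 904 into the instruction and leaves 4096 for the caller to materialize. A standalone sketch of that arithmetic for positive offsets only (the real code also encodes a subtract bit; `foldAM2Offset` is my name for illustration):

```cpp
#include <cstdio>

// Models the AddrMode2 branch of rewriteARMFrameIndex: NumBits = 12,
// Scale = 1. Returns true if the whole offset was folded; otherwise
// folds what fits and leaves the remainder in Offset.
static bool foldAM2Offset(int &Offset, unsigned &ImmedField) {
  const unsigned NumBits = 12, Scale = 1;
  const unsigned Mask = (1u << NumBits) - 1;  // 4095
  if ((unsigned)Offset <= Mask * Scale) {
    ImmedField = Offset / Scale;              // fits entirely
    Offset = 0;
    return true;
  }
  ImmedField = (Offset / Scale) & Mask;       // fold the low 12 bits
  Offset &= ~(int)(Mask * Scale);             // remainder for the caller
  return false;
}

int main() {
  int Offset = 5000; unsigned Imm = 0;
  bool Done = foldAM2Offset(Offset, Imm);
  std::printf("folded=%u leftover=%d done=%d\n", Imm, Offset, Done);
  // folded=904 leftover=4096 done=0
  return 0;
}
```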
+ if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrMode2; + + if (Opcode == ARM::ADDri) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset == 0) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::MOVr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.RemoveOperand(FrameRegIdx+1); + Offset = 0; + return true; + } else if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(ARM::SUBri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getSOImmVal(Offset) != -1) { + // Replace the FrameIndex with sp / fp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + Offset = 0; + return true; + } + + // Otherwise, pull as much of the immedidate into this ADDri/SUBri + // as possible. + unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + // Get the properly encoded SOImmVal field. + assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + } else { + unsigned ImmIdx = 0; + int InstrOffs = 0; + unsigned NumBits = 0; + unsigned Scale = 1; + switch (AddrMode) { + case ARMII::AddrMode2: { + ImmIdx = FrameRegIdx+2; + InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 12; + break; + } + case ARMII::AddrMode3: { + ImmIdx = FrameRegIdx+2; + InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + break; + } + case ARMII::AddrMode4: + // Can't fold any offset even if it's zero. + return false; + case ARMII::AddrMode5: { + ImmIdx = FrameRegIdx+1; + InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + Scale = 4; + break; + } + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + Offset += InstrOffs * Scale; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + if (Offset < 0) { + Offset = -Offset; + isSub = true; + } + + // Attempt to fold address comp. if opcode has offset bits + if (NumBits > 0) { + // Common case: small offset, fits into instruction. + MachineOperand &ImmOp = MI.getOperand(ImmIdx); + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (isSub) + ImmedOffset |= 1 << NumBits; + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, it didn't fit. Pull in what we can to simplify the immed. + ImmedOffset = ImmedOffset & Mask; + if (isSub) + ImmedOffset |= 1 << NumBits; + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + } + + Offset = (isSub) ? 
-Offset : Offset; + return Offset == 0; +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h new file mode 100644 index 0000000..a13155b --- /dev/null +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -0,0 +1,333 @@ +//===- ARMBaseInstrInfo.h - ARM Base Instruction Information -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEINSTRUCTIONINFO_H +#define ARMBASEINSTRUCTIONINFO_H + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// ARMII - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace ARMII { + enum { + //===------------------------------------------------------------------===// + // Instruction Flags. + + //===------------------------------------------------------------------===// + // This four-bit field describes the addressing mode used. + + AddrModeMask = 0xf, + AddrModeNone = 0, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrMode6 = 6, + AddrModeT1_1 = 7, + AddrModeT1_2 = 8, + AddrModeT1_4 = 9, + AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data + AddrModeT2_i12 = 11, + AddrModeT2_i8 = 12, + AddrModeT2_so = 13, + AddrModeT2_pc = 14, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 15, // i8 * 4 + + // Size* - Flags to keep track of the size of an instruction. + SizeShift = 4, + SizeMask = 7 << SizeShift, + SizeSpecial = 1, // 0 byte pseudo or special case. + Size8Bytes = 2, + Size4Bytes = 3, + Size2Bytes = 4, + + // IndexMode - Unindex, pre-indexed, or post-indexed. Only valid for load + // and store ops + IndexModeShift = 7, + IndexModeMask = 3 << IndexModeShift, + IndexModePre = 1, + IndexModePost = 2, + + //===------------------------------------------------------------------===// + // Instruction encoding formats. 
+ // + FormShift = 9, + FormMask = 0x3f << FormShift, + + // Pseudo instructions + Pseudo = 0 << FormShift, + + // Multiply instructions + MulFrm = 1 << FormShift, + + // Branch instructions + BrFrm = 2 << FormShift, + BrMiscFrm = 3 << FormShift, + + // Data Processing instructions + DPFrm = 4 << FormShift, + DPSoRegFrm = 5 << FormShift, + + // Load and Store + LdFrm = 6 << FormShift, + StFrm = 7 << FormShift, + LdMiscFrm = 8 << FormShift, + StMiscFrm = 9 << FormShift, + LdStMulFrm = 10 << FormShift, + + // Miscellaneous arithmetic instructions + ArithMiscFrm = 11 << FormShift, + + // Extend instructions + ExtFrm = 12 << FormShift, + + // VFP formats + VFPUnaryFrm = 13 << FormShift, + VFPBinaryFrm = 14 << FormShift, + VFPConv1Frm = 15 << FormShift, + VFPConv2Frm = 16 << FormShift, + VFPConv3Frm = 17 << FormShift, + VFPConv4Frm = 18 << FormShift, + VFPConv5Frm = 19 << FormShift, + VFPLdStFrm = 20 << FormShift, + VFPLdStMulFrm = 21 << FormShift, + VFPMiscFrm = 22 << FormShift, + + // Thumb format + ThumbFrm = 23 << FormShift, + + // NEON format + NEONFrm = 24 << FormShift, + NEONGetLnFrm = 25 << FormShift, + NEONSetLnFrm = 26 << FormShift, + NEONDupFrm = 27 << FormShift, + + //===------------------------------------------------------------------===// + // Misc flags. + + // UnaryDP - Indicates this is a unary data processing instruction, i.e. + // it doesn't have a Rn operand. + UnaryDP = 1 << 15, + + // Xform16Bit - Indicates this Thumb2 instruction may be transformed into + // a 16-bit Thumb instruction if certain conditions are met. + Xform16Bit = 1 << 16, + + //===------------------------------------------------------------------===// + // Field shifts - such shifts are used to set field while generating + // machine instructions. + M_BitShift = 5, + ShiftImmShift = 5, + ShiftShift = 7, + N_BitShift = 7, + ImmHiShift = 8, + SoRotImmShift = 8, + RegRsShift = 8, + ExtRotImmShift = 10, + RegRdLoShift = 12, + RegRdShift = 12, + RegRdHiShift = 16, + RegRnShift = 16, + S_BitShift = 20, + W_BitShift = 21, + AM3_I_BitShift = 22, + D_BitShift = 22, + U_BitShift = 23, + P_BitShift = 24, + I_BitShift = 25, + CondShift = 28 + }; +} + +class ARMBaseInstrInfo : public TargetInstrInfoImpl { +protected: + // Can be only subclassed. + explicit ARMBaseInstrInfo(); +public: + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + virtual unsigned getUnindexedOpcode(unsigned Opc) const =0; + + // Return true if the block does not fall through. + virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const =0; + + virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0; + + // Branch analysis. + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond) const; + + virtual + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + // Predication support. 
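With this commit the TSFlags word packs, per the ARM.td shifts and the ARMII enum above: addressing mode in bits 0-3, size in bits 4-6, index mode in bits 7-8, the 6-bit instruction format in bits 9-14, and the unary-data-processing flag at bit 15 (it was previously at bit 10, which is what the ARM.td hunk reorders). A standalone decoder sketch using the masks exactly as declared above; the TSFlags word in `main` is made up for illustration:

```cpp
#include <cstdio>

// Field layout from the ARMII enum above (post-patch).
enum {
  AddrModeMask   = 0xf,
  SizeShift      = 4,  SizeMask      = 7 << SizeShift,
  IndexModeShift = 7,  IndexModeMask = 3 << IndexModeShift,
  FormShift      = 9,  FormMask      = 0x3f << FormShift,
  UnaryDP        = 1 << 15
};

static void decodeTSFlags(unsigned TSFlags) {
  std::printf("addr-mode=%u size=%u index-mode=%u form=%u unaryDP=%u\n",
              TSFlags & AddrModeMask,
              (TSFlags & SizeMask) >> SizeShift,
              (TSFlags & IndexModeMask) >> IndexModeShift,
              (TSFlags & FormMask) >> FormShift,
              (TSFlags & UnaryDP) ? 1u : 0u);
}

int main() {
  // Hypothetical word: AddrMode2, Size4Bytes (3), pre-indexed (1),
  // DPFrm (4), not unary.
  unsigned TSFlags = 2 | (3 << SizeShift) | (1 << IndexModeShift)
                       | (4 << FormShift);
  decodeTSFlags(TSFlags); // addr-mode=2 size=3 index-mode=1 form=4 unaryDP=0
  return 0;
}
```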
+ bool isPredicated(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; + } + + ARMCC::CondCodes getPredicate(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm() + : ARMCC::AL; + } + + virtual + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; + + virtual + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + + /// GetInstSize - Returns the size of the specified MachineInstr. + /// + virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const; + + /// Return true if the instruction is a register to register move and return + /// the source and dest operands and their sub-register indices by reference. + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SrcSubIdx, unsigned &DstSubIdx) const; + + virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + virtual unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + + virtual bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC) const; + + virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC) const; + + virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + virtual bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const; + + virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + + virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr* LoadMI) const; + +}; + +static inline +const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) { + return MIB.addImm((int64_t)ARMCC::AL).addReg(0); +} + +static inline +const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) { + return MIB.addReg(0); +} + +static inline +const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB, + bool isDead = false) { + return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead)); +} + +static inline +const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) { + return MIB.addReg(0); +} + +static inline +bool isUncondBranchOpcode(int Opc) { + return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B; +} + +static inline +bool isCondBranchOpcode(int Opc) { + return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc; +} + +static inline +bool isJumpTableBranchOpcode(int Opc) { + return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd || + Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT; +} + +/// getInstrPredicate - If instruction is predicated, returns its predicate +/// condition, otherwise returns AL. It also returns the condition code +/// register by reference. 
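The `AddDefault*` helpers above exist because most ARM instructions carry two trailing predicate operands (a condition-code immediate plus the CPSR register) and, for instructions with an 's' bit, an optional CPSR-def operand. A fragment showing how they compose, mirroring the `copyRegToReg` code earlier in this patch; it assumes an LLVM tree of this vintage with `MBB`, `I`, `DL`, `DestReg`, and `SrcReg` in scope:

```cpp
// Build "MOVr DestReg, SrcReg" predicated on AL, with the optional CPSR
// def left unset. AddDefaultPred appends {ARMCC::AL, no-register};
// AddDefaultCC appends the empty optional-def register operand.
AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
                                .addReg(SrcReg)));
```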
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+int getMatchingCondBranchOpcode(int Opc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emit a series of
+/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                             unsigned DestReg, unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const ARMBaseInstrInfo &TII);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                            unsigned DestReg, unsigned BaseReg, int NumBytes,
+                            ARMCC::CondCodes Pred, unsigned PredReg,
+                            const ARMBaseInstrInfo &TII);
+
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 0000000..42ef183
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,1360 @@
+//===- ARMBaseRegisterInfo.cpp - ARM Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
+                                                   bool *isSPVFP) {
+  if (isSPVFP)
+    *isSPVFP = false;
+
+  using namespace ARM;
+  switch (RegEnum) {
+  default:
+    llvm_unreachable("Unknown ARM register!");
+  case R0: case D0: case Q0: return 0;
+  case R1: case D1: case Q1: return 1;
+  case R2: case D2: case Q2: return 2;
+  case R3: case D3: case Q3: return 3;
+  case R4: case D4: case Q4: return 4;
+  case R5: case D5: case Q5: return 5;
+  case R6: case D6: case Q6: return 6;
+  case R7: case D7: case Q7: return 7;
+  case R8: case D8: case Q8: return 8;
+  case R9: case D9: case Q9: return 9;
+  case R10: case D10: case Q10: return 10;
+  case R11: case D11: case Q11: return 11;
+  case R12: case D12: case Q12: return 12;
+  case SP: case D13: case Q13: return 13;
+  case LR: case D14: case Q14: return 14;
+  case PC: case D15: case Q15: return 15;
+
+  case D16: return 16;
+  case D17: return 17;
+  case D18: return 18;
+  case D19: return 19;
+  case D20: return 20;
+  case D21: return 21;
+  case D22: return 22;
+  case D23: return 23;
+  case D24: return 24;
+  case D25: return 25;
+  case D26: return 26;
+  case D27: return 27;
+  case D28: return 28;
+  case D29: return 29;
+  case D30: return 30;
+  case D31: return 31;
+
+  case S0: case S1: case S2: case S3:
+  case S4: case S5: case S6: case S7:
+  case S8: case S9: case S10: case S11:
+  case S12: case S13: case S14: case S15:
+  case S16: case S17: case S18: case S19:
+  case S20: case S21: case S22: case S23:
+  case S24: case S25: case S26: case S27:
+  case S28: case S29: case S30: case S31: {
+    if (isSPVFP)
+      *isSPVFP = true;
+    switch (RegEnum) {
+    default: return 0; // Avoid compile time warning.
+ case S0: return 0; + case S1: return 1; + case S2: return 2; + case S3: return 3; + case S4: return 4; + case S5: return 5; + case S6: return 6; + case S7: return 7; + case S8: return 8; + case S9: return 9; + case S10: return 10; + case S11: return 11; + case S12: return 12; + case S13: return 13; + case S14: return 14; + case S15: return 15; + case S16: return 16; + case S17: return 17; + case S18: return 18; + case S19: return 19; + case S20: return 20; + case S21: return 21; + case S22: return 22; + case S23: return 23; + case S24: return 24; + case S25: return 25; + case S26: return 26; + case S27: return 27; + case S28: return 28; + case S29: return 29; + case S30: return 30; + case S31: return 31; + } + } + } +} + +ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), + TII(tii), STI(sti), + FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) { +} + +const unsigned* +ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + static const unsigned CalleeSavedRegs[] = { + ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8, + ARM::R7, ARM::R6, ARM::R5, ARM::R4, + + ARM::D15, ARM::D14, ARM::D13, ARM::D12, + ARM::D11, ARM::D10, ARM::D9, ARM::D8, + 0 + }; + + static const unsigned DarwinCalleeSavedRegs[] = { + // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved + // register. + ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4, + ARM::R11, ARM::R10, ARM::R8, + + ARM::D15, ARM::D14, ARM::D13, ARM::D12, + ARM::D11, ARM::D10, ARM::D9, ARM::D8, + 0 + }; + return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs; +} + +const TargetRegisterClass* const * +ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass, + &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={ + &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass, + &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + + if (STI.isThumb1Only()) { + return STI.isTargetDarwin() + ? 
DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses; + } + return STI.isTargetDarwin() + ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses; +} + +BitVector ARMBaseRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + // FIXME: avoid re-calculating this everytime. + BitVector Reserved(getNumRegs()); + Reserved.set(ARM::SP); + Reserved.set(ARM::PC); + if (STI.isTargetDarwin() || hasFP(MF)) + Reserved.set(FramePtr); + // Some targets reserve R9. + if (STI.isR9Reserved()) + Reserved.set(ARM::R9); + return Reserved; +} + +bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + switch (Reg) { + default: break; + case ARM::SP: + case ARM::PC: + return true; + case ARM::R7: + case ARM::R11: + if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF))) + return true; + break; + case ARM::R9: + return STI.isR9Reserved(); + } + + return false; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { + return ARM::GPRRegisterClass; +} + +/// getAllocationOrder - Returns the register allocation order for a specified +/// register class in the form of a pair of TargetRegisterClass iterators. +std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> +ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const { + // Alternative register allocation orders when favoring even / odd registers + // of register pairs. + + // No FP, R9 is available. + static const unsigned GPREven1[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, + ARM::R9, ARM::R11 + }; + static const unsigned GPROdd1[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, + ARM::R8, ARM::R10 + }; + + // FP is R7, R9 is available. + static const unsigned GPREven2[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, + ARM::R9, ARM::R11 + }; + static const unsigned GPROdd2[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, + ARM::R8, ARM::R10 + }; + + // FP is R11, R9 is available. + static const unsigned GPREven3[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, + ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, + ARM::R9 + }; + static const unsigned GPROdd3[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9, + ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7, + ARM::R8 + }; + + // No FP, R9 is not available. + static const unsigned GPREven4[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8, + ARM::R11 + }; + static const unsigned GPROdd4[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, + ARM::R10 + }; + + // FP is R7, R9 is not available. + static const unsigned GPREven5[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8, + ARM::R11 + }; + static const unsigned GPROdd5[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, + ARM::R10 + }; + + // FP is R11, R9 is not available. 
+ static const unsigned GPREven6[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, + ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8 + }; + static const unsigned GPROdd6[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, + ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8 + }; + + + if (HintType == ARMRI::RegPairEven) { + if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) + // It's no longer possible to fulfill this hint. Return the default + // allocation order. + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); + + if (!STI.isTargetDarwin() && !hasFP(MF)) { + if (!STI.isR9Reserved()) + return std::make_pair(GPREven1, + GPREven1 + (sizeof(GPREven1)/sizeof(unsigned))); + else + return std::make_pair(GPREven4, + GPREven4 + (sizeof(GPREven4)/sizeof(unsigned))); + } else if (FramePtr == ARM::R7) { + if (!STI.isR9Reserved()) + return std::make_pair(GPREven2, + GPREven2 + (sizeof(GPREven2)/sizeof(unsigned))); + else + return std::make_pair(GPREven5, + GPREven5 + (sizeof(GPREven5)/sizeof(unsigned))); + } else { // FramePtr == ARM::R11 + if (!STI.isR9Reserved()) + return std::make_pair(GPREven3, + GPREven3 + (sizeof(GPREven3)/sizeof(unsigned))); + else + return std::make_pair(GPREven6, + GPREven6 + (sizeof(GPREven6)/sizeof(unsigned))); + } + } else if (HintType == ARMRI::RegPairOdd) { + if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0) + // It's no longer possible to fulfill this hint. Return the default + // allocation order. + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); + + if (!STI.isTargetDarwin() && !hasFP(MF)) { + if (!STI.isR9Reserved()) + return std::make_pair(GPROdd1, + GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned))); + else + return std::make_pair(GPROdd4, + GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned))); + } else if (FramePtr == ARM::R7) { + if (!STI.isR9Reserved()) + return std::make_pair(GPROdd2, + GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned))); + else + return std::make_pair(GPROdd5, + GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned))); + } else { // FramePtr == ARM::R11 + if (!STI.isR9Reserved()) + return std::make_pair(GPROdd3, + GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned))); + else + return std::make_pair(GPROdd6, + GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned))); + } + } + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); +} + +/// ResolveRegAllocHint - Resolves the specified register allocation hint +/// to a physical register. Returns the physical register if it is successful. +unsigned +ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg, + const MachineFunction &MF) const { + if (Reg == 0 || !isPhysicalRegister(Reg)) + return 0; + if (Type == 0) + return Reg; + else if (Type == (unsigned)ARMRI::RegPairOdd) + // Odd register. + return getRegisterPairOdd(Reg, MF); + else if (Type == (unsigned)ARMRI::RegPairEven) + // Even register. + return getRegisterPairEven(Reg, MF); + return 0; +} + +void +ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const { + MachineRegisterInfo *MRI = &MF.getRegInfo(); + std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); + if ((Hint.first == (unsigned)ARMRI::RegPairOdd || + Hint.first == (unsigned)ARMRI::RegPairEven) && + Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) { + // If 'Reg' is one of the even / odd register pair and it's now changed + // (e.g. 
coalesced) into a different register. The other register of the + // pair allocation hint must be updated to reflect the relationship + // change. + unsigned OtherReg = Hint.second; + Hint = MRI->getRegAllocationHint(OtherReg); + if (Hint.second == Reg) + // Make sure the pair has not already divorced. + MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); + } +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. +/// +bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return (NoFramePointerElim || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken()); +} + +bool ARMBaseRegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + if (NoFramePointerElim && MFI->hasCalls()) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); +} + +/// estimateStackSize - Estimate and return the size of the frame. +static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + } + return (unsigned)Offset; +} + +/// estimateRSStackSizeLimit - Look at each instruction that references stack +/// frames and return the stack size limit beyond which some of these +/// instructions will require scratch register during their expansion later. +unsigned +ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { + unsigned Limit = (1 << 12) - 1; + for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ++I) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!I->getOperand(i).isFI()) continue; + + const TargetInstrDesc &Desc = TII.get(I->getOpcode()); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + if (AddrMode == ARMII::AddrMode3 || + AddrMode == ARMII::AddrModeT2_i8) + return (1 << 8) - 1; + + if (AddrMode == ARMII::AddrMode5 || + AddrMode == ARMII::AddrModeT2_i8s4) + Limit = std::min(Limit, ((1U << 8) - 1) * 4); + + if (AddrMode == ARMII::AddrModeT2_i12 && hasFP(MF)) + // When the stack offset is negative, we will end up using + // the i8 instructions instead. + return (1 << 8) - 1; + break; // At most one FI per instruction + } + } + } + + return Limit; +} + +void +ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + // This tells PEI to spill the FP as if it is any other callee-save register + // to take advantage the eliminateFrameIndex machinery. This also ensures it + // is spilled in the order specified by getCalleeSavedRegs() to make it easier + // to combine multiple loads / stores. 
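+  // As laid out by emitPrologue below, spill area 1 holds R4-R7 and LR
+  // (plus R8-R11 on non-Darwin targets), area 2 holds R8-R11 on Darwin,
+  // and the DPR area holds D8-D15.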
+ bool CanEliminateFrame = true; + bool CS1Spilled = false; + bool LRSpilled = false; + unsigned NumGPRSpills = 0; + SmallVector<unsigned, 4> UnspilledCS1GPRs; + SmallVector<unsigned, 4> UnspilledCS2GPRs; + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Don't spill FP if the frame can be eliminated. This is determined + // by scanning the callee-save registers to see if any is used. + const unsigned *CSRegs = getCalleeSavedRegs(); + const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + bool Spilled = false; + if (MF.getRegInfo().isPhysRegUsed(Reg)) { + AFI->setCSRegisterIsSpilled(Reg); + Spilled = true; + CanEliminateFrame = false; + } else { + // Check alias registers too. + for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) { + if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { + Spilled = true; + CanEliminateFrame = false; + } + } + } + + if (CSRegClasses[i] == ARM::GPRRegisterClass || + CSRegClasses[i] == ARM::tGPRRegisterClass) { + if (Spilled) { + NumGPRSpills++; + + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) + LRSpilled = true; + CS1Spilled = true; + continue; + } + + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. + switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; + } + } + } + } + + bool ForceLRSpill = false; + if (!LRSpilled && AFI->isThumb1OnlyFunction()) { + unsigned FnSize = TII.GetFunctionSizeInBytes(MF); + // Force LR to be spilled if the Thumb function size is > 2048. This enables + // use of BL to implement far jump. If it turns out that it's not needed + // then the branch fix up path will undo it. + if (FnSize >= (1 << 11)) { + CanEliminateFrame = false; + ForceLRSpill = true; + } + } + + bool ExtraCSSpill = false; + if (!CanEliminateFrame || cannotEliminateFrame(MF)) { + AFI->setHasStackFrame(true); + + // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. + // Spill LR as well so we can fold BX_RET to the registers restore (LDM). + if (!LRSpilled && CS1Spilled) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + AFI->setCSRegisterIsSpilled(ARM::LR); + NumGPRSpills++; + UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), + UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); + ForceLRSpill = false; + ExtraCSSpill = true; + } + + // Darwin ABI requires FP to point to the stack slot that contains the + // previous FP. + if (STI.isTargetDarwin() || hasFP(MF)) { + MF.getRegInfo().setPhysRegUsed(FramePtr); + NumGPRSpills++; + } + + // If stack and double are 8-byte aligned and we are spilling an odd number + // of GPRs. Spill one extra callee save GPR so we won't have to pad between + // the integer and double callee save areas. 
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + if (TargetAlign == 8 && (NumGPRSpills & 1)) { + if (CS1Spilled && !UnspilledCS1GPRs.empty()) { + for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { + unsigned Reg = UnspilledCS1GPRs[i]; + // Don't spill high register if the function is thumb1 + if (!AFI->isThumb1OnlyFunction() || + isARMLowRegister(Reg) || Reg == ARM::LR) { + MF.getRegInfo().setPhysRegUsed(Reg); + AFI->setCSRegisterIsSpilled(Reg); + if (!isReservedReg(MF, Reg)) + ExtraCSSpill = true; + break; + } + } + } else if (!UnspilledCS2GPRs.empty() && + !AFI->isThumb1OnlyFunction()) { + unsigned Reg = UnspilledCS2GPRs.front(); + MF.getRegInfo().setPhysRegUsed(Reg); + AFI->setCSRegisterIsSpilled(Reg); + if (!isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } + } + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. Thumb1 needs a spill slot for stack pointer + // adjustments also, even when the frame itself is small. + if (RS && !ExtraCSSpill) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + // If any of the stack slot references may be out of range of an + // immediate offset, make sure a register (or a spill slot) is + // available for the register scavenger. Note that if we're indexing + // off the frame pointer, the effective stack size is 4 bytes larger + // since the FP points to the stack slot of the previous FP. + if (estimateStackSize(MF, MFI) + (hasFP(MF) ? 4 : 0) + >= estimateRSStackSizeLimit(MF)) { + // If any non-reserved CS register isn't spilled, just spill one or two + // extra. That should take care of it! + unsigned NumExtras = TargetAlign / 4; + SmallVector<unsigned, 2> Extras; + while (NumExtras && !UnspilledCS1GPRs.empty()) { + unsigned Reg = UnspilledCS1GPRs.back(); + UnspilledCS1GPRs.pop_back(); + if (!isReservedReg(MF, Reg)) { + Extras.push_back(Reg); + NumExtras--; + } + } + // For non-Thumb1 functions, also check for hi-reg CS registers + if (!AFI->isThumb1OnlyFunction()) { + while (NumExtras && !UnspilledCS2GPRs.empty()) { + unsigned Reg = UnspilledCS2GPRs.back(); + UnspilledCS2GPRs.pop_back(); + if (!isReservedReg(MF, Reg)) { + Extras.push_back(Reg); + NumExtras--; + } + } + } + if (Extras.size() && NumExtras == 0) { + for (unsigned i = 0, e = Extras.size(); i != e; ++i) { + MF.getRegInfo().setPhysRegUsed(Extras[i]); + AFI->setCSRegisterIsSpilled(Extras[i]); + } + } else if (!AFI->isThumb1OnlyFunction()) { + // note: Thumb1 functions spill to R12, not the stack. + // Reserve a slot closest to SP or frame pointer. 
+ const TargetRegisterClass *RC = ARM::GPRRegisterClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment())); + } + } + } + } + + if (ForceLRSpill) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + AFI->setCSRegisterIsSpilled(ARM::LR); + AFI->setLRIsSpilledForFarJump(true); + } +} + +unsigned ARMBaseRegisterInfo::getRARegister() const { + return ARM::LR; +} + +unsigned ARMBaseRegisterInfo::getFrameRegister(MachineFunction &MF) const { + if (STI.isTargetDarwin() || hasFP(MF)) + return FramePtr; + return ARM::SP; +} + +unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const { + llvm_unreachable("What is the exception register"); + return 0; +} + +unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const { + llvm_unreachable("What is the exception handler register"); + return 0; +} + +int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { + return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); +} + +unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg, + const MachineFunction &MF) const { + switch (Reg) { + default: break; + // Return 0 if either register of the pair is a special register. + // So no R12, etc. + case ARM::R1: + return ARM::R0; + case ARM::R3: + // FIXME! + return STI.isThumb1Only() ? 0 : ARM::R2; + case ARM::R5: + return ARM::R4; + case ARM::R7: + return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6; + case ARM::R9: + return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8; + case ARM::R11: + return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10; + + case ARM::S1: + return ARM::S0; + case ARM::S3: + return ARM::S2; + case ARM::S5: + return ARM::S4; + case ARM::S7: + return ARM::S6; + case ARM::S9: + return ARM::S8; + case ARM::S11: + return ARM::S10; + case ARM::S13: + return ARM::S12; + case ARM::S15: + return ARM::S14; + case ARM::S17: + return ARM::S16; + case ARM::S19: + return ARM::S18; + case ARM::S21: + return ARM::S20; + case ARM::S23: + return ARM::S22; + case ARM::S25: + return ARM::S24; + case ARM::S27: + return ARM::S26; + case ARM::S29: + return ARM::S28; + case ARM::S31: + return ARM::S30; + + case ARM::D1: + return ARM::D0; + case ARM::D3: + return ARM::D2; + case ARM::D5: + return ARM::D4; + case ARM::D7: + return ARM::D6; + case ARM::D9: + return ARM::D8; + case ARM::D11: + return ARM::D10; + case ARM::D13: + return ARM::D12; + case ARM::D15: + return ARM::D14; + case ARM::D17: + return ARM::D16; + case ARM::D19: + return ARM::D18; + case ARM::D21: + return ARM::D20; + case ARM::D23: + return ARM::D22; + case ARM::D25: + return ARM::D24; + case ARM::D27: + return ARM::D26; + case ARM::D29: + return ARM::D28; + case ARM::D31: + return ARM::D30; + } + + return 0; +} + +unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg, + const MachineFunction &MF) const { + switch (Reg) { + default: break; + // Return 0 if either register of the pair is a special register. + // So no R12, etc. + case ARM::R0: + return ARM::R1; + case ARM::R2: + // FIXME! + return STI.isThumb1Only() ? 0 : ARM::R3; + case ARM::R4: + return ARM::R5; + case ARM::R6: + return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7; + case ARM::R8: + return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9; + case ARM::R10: + return isReservedReg(MF, ARM::R11) ? 
0 : ARM::R11; + + case ARM::S0: + return ARM::S1; + case ARM::S2: + return ARM::S3; + case ARM::S4: + return ARM::S5; + case ARM::S6: + return ARM::S7; + case ARM::S8: + return ARM::S9; + case ARM::S10: + return ARM::S11; + case ARM::S12: + return ARM::S13; + case ARM::S14: + return ARM::S15; + case ARM::S16: + return ARM::S17; + case ARM::S18: + return ARM::S19; + case ARM::S20: + return ARM::S21; + case ARM::S22: + return ARM::S23; + case ARM::S24: + return ARM::S25; + case ARM::S26: + return ARM::S27; + case ARM::S28: + return ARM::S29; + case ARM::S30: + return ARM::S31; + + case ARM::D0: + return ARM::D1; + case ARM::D2: + return ARM::D3; + case ARM::D4: + return ARM::D5; + case ARM::D6: + return ARM::D7; + case ARM::D8: + return ARM::D9; + case ARM::D10: + return ARM::D11; + case ARM::D12: + return ARM::D13; + case ARM::D14: + return ARM::D15; + case ARM::D16: + return ARM::D17; + case ARM::D18: + return ARM::D19; + case ARM::D20: + return ARM::D21; + case ARM::D22: + return ARM::D23; + case ARM::D24: + return ARM::D25; + case ARM::D26: + return ARM::D27; + case ARM::D28: + return ARM::D29; + case ARM::D30: + return ARM::D31; + } + + return 0; +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void ARMBaseRegisterInfo:: +emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, + unsigned PredReg) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + Constant *C = + ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx) + .addReg(0).addImm(0).addImm(Pred).addReg(PredReg); +} + +bool ARMBaseRegisterInfo:: +requiresRegisterScavenging(const MachineFunction &MF) const { + return true; +} + +// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +// not required, we reserve argument space for call sites in the function +// immediately on entry to the current function. This eliminates the need for +// add/sub sp brackets around call sites. Returns true if the call frame is +// included as part of the stack frame. +bool ARMBaseRegisterInfo:: +hasReservedCallFrame(MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. 
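+  // (An ARM imm12 offset reaches at most 4095 bytes, so a call frame bigger
+  // than roughly half that range would leave too little addressing room for
+  // the rest of the frame.)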
+  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static void
+emitSPUpdate(bool isARM,
+             MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+             DebugLoc dl, const ARMBaseInstrInfo &TII,
+             int NumBytes,
+             ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+  if (isARM)
+    emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                            Pred, PredReg, TII);
+  else
+    emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                           Pred, PredReg, TII);
+}
+
+
+void ARMBaseRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly. To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      assert(!AFI->isThumb1OnlyFunction() &&
+             "This eliminateCallFramePseudoInstr does not support Thumb1!");
+      bool isARM = !AFI->isThumbFunction();
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm();
+      // FIXME: Thumb2 version of ADJCALLSTACKUP and ADJCALLSTACKDOWN?
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+        unsigned PredReg = Old->getOperand(2).getReg();
+        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg);
+      } else {
+        // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+        unsigned PredReg = Old->getOperand(3).getReg();
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' ARM register. If the register
+/// scavenger is not being used, R12 is available. Otherwise, try for a
+/// call-clobbered register first and then a spilled callee-saved register
+/// if that fails.
+static
+unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
+                             ARMFunctionInfo *AFI) {
+  unsigned Reg = RS ? RS->FindUnusedReg(RC) : (unsigned) ARM::R12;
+  assert(!AFI->isThumb1OnlyFunction());
+  return Reg;
+}
+
+unsigned
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, int *Value,
+                                         RegScavenger *RS) const {
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateFrameIndex does not support Thumb1!");
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned FrameReg = ARM::SP;
+  int FrameIndex = MI.getOperand(i).getIndex();
+  int Offset = MFI->getObjectOffset(FrameIndex) + MFI->getStackSize() + SPAdj;
+
+  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea2Offset();
+  else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
+    Offset -= AFI->getDPRCalleeSavedAreaOffset();
+  else if (hasFP(MF) && AFI->hasStackFrame()) {
+    assert(SPAdj == 0 && "Unexpected stack offset!");
+    // Use the frame pointer to reference fixed objects unless this is a
+    // frameless function.
+    FrameReg = getFrameRegister(MF);
+    Offset -= AFI->getFramePtrSpillOffset();
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible.
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII);
+  }
+  if (Done)
+    return 0;
+
+  // If we get here, the immediate doesn't fit into the instruction. We folded
+  // as much as possible above; handle the rest by providing a register that
+  // is SP+LargeImm.
+  assert((Offset ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4) &&
+         "This code isn't needed if offset already handled!");
+
+  // Insert a set of r12 with the full address: r12 = sp + offset.
+  // If the offset we have is too large to fit into the instruction, we need
+  // to form it with a series of ADDri's. Do this by taking 8-bit chunks
+  // out of 'Offset'.
+  unsigned ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
+  if (ScratchReg == 0)
+    // No register is "free". Scavenge a register.
+    ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+  int PIdx = MI.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+    ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+  unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+  if (Offset == 0)
+    // Must be addrmode4.
+    MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
+  else {
+    if (!AFI->isThumbFunction())
+      emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                              Offset, Pred, PredReg, TII);
+    else {
+      assert(AFI->isThumb2Function());
+      emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                             Offset, Pred, PredReg, TII);
+    }
+    MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+  }
+  return 0;
+}
+
+/// Move the iterator past the next bunch of callee save load / store ops for
+/// the particular spill area (1: integer area 1, 2: integer area 2,
+/// 3: fp area, 0: don't care).
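+/// emitPrologue and emitEpilogue use this to position each SP adjustment
+/// next to the load / store instructions for the corresponding spill area.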
+static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator &MBBI,
+                                   int Opc1, int Opc2, unsigned Area,
+                                   const ARMSubtarget &STI) {
+  while (MBBI != MBB.end() &&
+         ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) &&
+         MBBI->getOperand(1).isFI()) {
+    if (Area != 0) {
+      bool Done = false;
+      unsigned Category = 0;
+      switch (MBBI->getOperand(0).getReg()) {
+      case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        Category = 1;
+        break;
+      case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
+        Category = STI.isTargetDarwin() ? 2 : 1;
+        break;
+      case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
+      case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
+        Category = 3;
+        break;
+      default:
+        Done = true;
+        break;
+      }
+      if (Done || Category != Area)
+        break;
+    }
+
+    ++MBBI;
+  }
+}
+
+void ARMBaseRegisterInfo::
+emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitPrologue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  // Determine the size of each callee-save spill area and record which frame
+  // index belongs to which callee-save spill area.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  // Allocate the vararg register save area. This is not counted in NumBytes.
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetDarwin()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  // Build the new SUBri to adjust SP for integer callee-save spill area 1.
+  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size);
+  movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI);
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For Darwin, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not Darwin, all the callee-saved registers go
+  // into spill area 1, including the FP in R11. In either case, it is
+  // now safe to emit this assignment.
+  if (STI.isTargetDarwin() || hasFP(MF)) {
+    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0);
+    AddDefaultCC(AddDefaultPred(MIB));
+  }
+
+  // Build the new SUBri to adjust SP for integer callee-save spill area 2.
+  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size);
+
+  // Build the new SUBri to adjust SP for FP callee-save spill area.
+  movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI);
+  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize);
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  NumBytes = DPRCSOffset;
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 0, 3, STI);
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+  }
+
+  if (STI.isTargetELF() && hasFP(MF)) {
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+  }
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI,
+                        const ARMBaseInstrInfo &TII,
+                        const unsigned *CSRegs) {
+  return ((MI->getOpcode() == (int)ARM::FLDD ||
+           MI->getOpcode() == (int)ARM::LDR ||
+           MI->getOpcode() == (int)ARM::t2LDRi12) &&
+          MI->getOperand(1).isFI() &&
+          isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
+}
+
+void ARMBaseRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert(MBBI->getDesc().isReturn() &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitEpilogue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+  } else {
+    // Unwind MBBI to point to the first LDR / FLDD.
+    const unsigned *CSRegs = getCalleeSavedRegs();
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      if (!isCSRestore(MBBI, TII, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    // Darwin ABI requires FP to point to the stack slot that contains the
+    // previous FP.
+    bool HasFP = hasFP(MF);
+    if ((STI.isTargetDarwin() && NumBytes) || HasFP) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on the frame pointer only if the stack frame extends
+      // beyond the frame pointer stack slot, or the target is ELF and the
+      // function has the FP.
+ if (HasFP || + AFI->getGPRCalleeSavedArea2Size() || + AFI->getDPRCalleeSavedAreaSize() || + AFI->getDPRCalleeSavedAreaOffset()) { + if (NumBytes) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + } else { + // Thumb2 or ARM. + if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) + .addReg(FramePtr) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) + .addReg(FramePtr); + } + } + } else if (NumBytes) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + + // Move SP to start of integer callee save spill area 2. + movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 0, 3, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize()); + + // Move SP to start of integer callee save spill area 1. + movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size()); + + // Move SP to SP upon entry to the function. + movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size()); + } + + if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); +} + +#include "ARMGenRegisterInfo.inc" diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h new file mode 100644 index 0000000..da703fb --- /dev/null +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -0,0 +1,148 @@ +//===- ARMBaseRegisterInfo.h - ARM Register Information Impl ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEREGISTERINFO_H +#define ARMBASEREGISTERINFO_H + +#include "ARM.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "ARMGenRegisterInfo.h.inc" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +/// Register allocation hints. +namespace ARMRI { + enum { + RegPairOdd = 1, + RegPairEven = 2 + }; +} + +/// isARMLowRegister - Returns true if the register is low register r0-r7. +/// +static inline bool isARMLowRegister(unsigned Reg) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + return true; + default: + return false; + } +} + +struct ARMBaseRegisterInfo : public ARMGenRegisterInfo { +protected: + const ARMBaseInstrInfo &TII; + const ARMSubtarget &STI; + + /// FramePtr - ARM physical register used as frame ptr. + unsigned FramePtr; + + // Can be only subclassed. + explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &STI); + + // Return the opcode that implements 'Op', or 0 if no opcode + unsigned getOpcode(int Op) const; + +public: + /// getRegisterNumbering - Given the enum value for some register, e.g. + /// ARM::LR, return the number that it corresponds to (e.g. 14). It + /// also returns true in isSPVFP if the register is a single precision + /// VFP register. 
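+  /// For example (illustrative), getRegisterNumbering(ARM::D7) returns 7,
+  /// while getRegisterNumbering(ARM::S5, &isSP) returns 5 and sets isSP.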
+ static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0); + + /// Code Generation virtual methods... + const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* + getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + + std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> + getAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const; + + unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg, + const MachineFunction &MF) const; + + void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + bool cannotEliminateFrame(const MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; + + int getDwarfRegNum(unsigned RegNum, bool isEH) const; + + bool isLowRegister(unsigned Reg) const; + + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + virtual void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) const; + + /// Code Generation virtual methods... + virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + + virtual bool hasReservedCallFrame(MachineFunction &MF) const; + + virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, int *Value = NULL, + RegScavenger *RS = NULL) const; + + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + +private: + unsigned estimateRSStackSizeLimit(MachineFunction &MF) const; + + unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const; + + unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 8a4c741..7161639 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -111,6 +111,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, @@ -122,6 +123,7 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, 
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index f295761..6f1c624 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -26,14 +26,18 @@ #include "llvm/PassManager.h" #include "llvm/CodeGen/MachineCodeEmitter.h" #include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/CodeGen/ObjectCodeEmitter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #ifndef NDEBUG #include <iomanip> #endif @@ -57,12 +61,18 @@ namespace { ARMJITInfo *JTI; const ARMInstrInfo *II; const TargetData *TD; + const ARMSubtarget *Subtarget; TargetMachine &TM; CodeEmitter &MCE; const std::vector<MachineConstantPoolEntry> *MCPEs; const std::vector<MachineJumpTableEntry> *MJTEs; bool IsPIC; + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineModuleInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + public: static char ID; explicit Emitter(TargetMachine &tm, CodeEmitter &mce) @@ -160,7 +170,7 @@ namespace { /// Routines that handle operands which add machine relocations which are /// fixed up by the relocation stage. void emitGlobalAddress(GlobalValue *GV, unsigned Reloc, - bool NeedStub, intptr_t ACPV = 0); + bool NeedStub, bool Indirect, intptr_t ACPV = 0); void emitExternalSymbolAddress(const char *ES, unsigned Reloc); void emitConstPoolAddress(unsigned CPI, unsigned Reloc); void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc); @@ -174,36 +184,39 @@ namespace { /// createARMCodeEmitterPass - Return a pass that emits the collected ARM code /// to the specified MCE object. 
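+/// The Emitter class below is a template over the emitter type, so the same
+/// code serves MachineCodeEmitter, JITCodeEmitter, and the ObjectCodeEmitter
+/// used for direct object-file emission.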
-namespace llvm { - -FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM, - MachineCodeEmitter &MCE) { +FunctionPass *llvm::createARMCodeEmitterPass(ARMBaseTargetMachine &TM, + MachineCodeEmitter &MCE) { return new Emitter<MachineCodeEmitter>(TM, MCE); } -FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, - JITCodeEmitter &JCE) { +FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, + JITCodeEmitter &JCE) { return new Emitter<JITCodeEmitter>(TM, JCE); } - -} // end namespace llvm +FunctionPass *llvm::createARMObjectCodeEmitterPass(ARMBaseTargetMachine &TM, + ObjectCodeEmitter &OCE) { + return new Emitter<ObjectCodeEmitter>(TM, OCE); +} template<class CodeEmitter> bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { assert((MF.getTarget().getRelocationModel() != Reloc::Default || MF.getTarget().getRelocationModel() != Reloc::Static) && "JIT relocation model must be set to static or default!"); + JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo(); II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo(); TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData(); - JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo(); + Subtarget = &TM.getSubtarget<ARMSubtarget>(); MCPEs = &MF.getConstantPool()->getConstants(); MJTEs = &MF.getJumpTableInfo()->getJumpTables(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; JTI->Initialize(MF, IsPIC); + MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>()); do { - DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n"; + DEBUG(errs() << "JITTing function '" + << MF.getFunction()->getName() << "'\n"); MCE.startFunction(MF); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { MCE.StartMachineBasicBlock(MBB); for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); @@ -220,7 +233,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { template<class CodeEmitter> unsigned Emitter<CodeEmitter>::getShiftOp(unsigned Imm) const { switch (ARM_AM::getAM2ShiftOpc(Imm)) { - default: assert(0 && "Unknown shift opc!"); + default: llvm_unreachable("Unknown shift opc!"); case ARM_AM::asr: return 2; case ARM_AM::lsl: return 0; case ARM_AM::lsr: return 1; @@ -240,7 +253,7 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI, else if (MO.isImm()) return static_cast<unsigned>(MO.getImm()); else if (MO.isGlobal()) - emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true); + emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false); else if (MO.isSymbol()) emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch); else if (MO.isCPI()) { @@ -254,8 +267,10 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI, else if (MO.isMBB()) emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch); else { - cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n"; - abort(); +#ifndef NDEBUG + errs() << MO; +#endif + llvm_unreachable(0); } return 0; } @@ -264,9 +279,14 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI, /// template<class CodeEmitter> void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc, - bool NeedStub, intptr_t ACPV) { - MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, - GV, ACPV, NeedStub)); + bool NeedStub, bool Indirect, + intptr_t ACPV) { + MachineRelocation MR = Indirect + ? 
MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, + GV, ACPV, NeedStub) + : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, + GV, ACPV, NeedStub); + MCE.addRelocation(MR); } /// emitExternalSymbolAddress - Arrange for the address of an external symbol to @@ -294,7 +314,7 @@ void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, /// be emitted to the current location in the function, and allow it to be PC /// relative. template<class CodeEmitter> -void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex, +void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) { MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), Reloc, JTIndex, 0, true)); @@ -310,32 +330,28 @@ void Emitter<CodeEmitter>::emitMachineBasicBlock(MachineBasicBlock *BB, template<class CodeEmitter> void Emitter<CodeEmitter>::emitWordLE(unsigned Binary) { -#ifndef NDEBUG - DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0') - << Binary << std::dec << "\n"; -#endif + DEBUG(errs() << " 0x"; + errs().write_hex(Binary) << "\n"); MCE.emitWordLE(Binary); } template<class CodeEmitter> void Emitter<CodeEmitter>::emitDWordLE(uint64_t Binary) { -#ifndef NDEBUG - DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0') - << (unsigned)Binary << std::dec << "\n"; - DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0') - << (unsigned)(Binary >> 32) << std::dec << "\n"; -#endif + DEBUG(errs() << " 0x"; + errs().write_hex(Binary) << "\n"); MCE.emitDWordLE(Binary); } template<class CodeEmitter> void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) { - DOUT << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI; + DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI); + + MCE.processDebugLoc(MI.getDebugLoc(), true); NumEmitted++; // Keep track of the # of mi's emitted switch (MI.getDesc().TSFlags & ARMII::FormMask) { default: { - assert(0 && "Unhandled instruction encoding format!"); + llvm_unreachable("Unhandled instruction encoding format!"); break; } case ARMII::Pseudo: @@ -393,6 +409,7 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) { emitMiscInstruction(MI); break; } + MCE.processDebugLoc(MI.getDebugLoc(), false); } template<class CodeEmitter> @@ -400,7 +417,7 @@ void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) { unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index. unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index. const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex]; - + // Remember the CONSTPOOL_ENTRY address for later relocation. 
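+  // (Roughly: constant pool entries are emitted inline as constant islands,
+  // and the recorded address lets pc-relative loads of this entry be fixed
+  // up once its final location is known.)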
JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue()); @@ -410,55 +427,49 @@ void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) { ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); - DOUT << " ** ARM constant pool #" << CPI << " @ " - << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n'; + DEBUG(errs() << " ** ARM constant pool #" << CPI << " @ " + << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n'); GlobalValue *GV = ACPV->getGV(); if (GV) { - assert(!ACPV->isStub() && "Don't know how to deal this yet!"); - if (ACPV->isNonLazyPointer()) - MCE.addRelocation(MachineRelocation::getIndirectSymbol( - MCE.getCurrentPCOffset(), ARM::reloc_arm_machine_cp_entry, GV, - (intptr_t)ACPV, false)); - else - emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry, - ACPV->isStub() || isa<Function>(GV), (intptr_t)ACPV); + Reloc::Model RelocM = TM.getRelocationModel(); + emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry, + isa<Function>(GV), + Subtarget->GVIsIndirectSymbol(GV, RelocM), + (intptr_t)ACPV); } else { - assert(!ACPV->isNonLazyPointer() && "Don't know how to deal this yet!"); emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute); } emitWordLE(0); } else { Constant *CV = MCPE.Val.ConstVal; -#ifndef NDEBUG - DOUT << " ** Constant pool #" << CPI << " @ " - << (void*)MCE.getCurrentPCValue() << " "; - if (const Function *F = dyn_cast<Function>(CV)) - DOUT << F->getName(); - else - DOUT << *CV; - DOUT << '\n'; -#endif + DEBUG({ + errs() << " ** Constant pool #" << CPI << " @ " + << (void*)MCE.getCurrentPCValue() << " "; + if (const Function *F = dyn_cast<Function>(CV)) + errs() << F->getName(); + else + errs() << *CV; + errs() << '\n'; + }); if (GlobalValue *GV = dyn_cast<GlobalValue>(CV)) { - emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV)); + emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false); emitWordLE(0); } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { uint32_t Val = *(uint32_t*)CI->getValue().getRawData(); emitWordLE(Val); } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { - if (CFP->getType() == Type::FloatTy) + if (CFP->getType()->isFloatTy()) emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); - else if (CFP->getType() == Type::DoubleTy) + else if (CFP->getType()->isDoubleTy()) emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); else { - assert(0 && "Unable to handle this constantpool entry!"); - abort(); + llvm_unreachable("Unable to handle this constantpool entry!"); } } else { - assert(0 && "Unable to handle this constantpool entry!"); - abort(); + llvm_unreachable("Unable to handle this constantpool entry!"); } } } @@ -467,7 +478,8 @@ template<class CodeEmitter> void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) { const MachineOperand &MO0 = MI.getOperand(0); const MachineOperand &MO1 = MI.getOperand(1); - assert(MO1.isImm() && "Not a valid so_imm value!"); + assert(MO1.isImm() && ARM_AM::getSOImmVal(MO1.getImm()) != -1 && + "Not a valid so_imm value!"); unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm()); unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm()); @@ -483,7 +495,7 @@ void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) { // Encode so_imm.
// Set bit I(25) to identify this is the immediate form of <shifter_op> Binary |= 1 << ARMII::I_BitShift; - Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V1)); + Binary |= getMachineSoImmOpValue(V1); emitWordLE(Binary); // Now the 'orr' instruction. @@ -501,14 +513,14 @@ void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) { // Encode so_imm. // Set bit I(25) to identify this is the immediate form of <shifter_op> Binary |= 1 << ARMII::I_BitShift; - Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V2)); + Binary |= getMachineSoImmOpValue(V2); emitWordLE(Binary); } template<class CodeEmitter> void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) { // It's basically add r, pc, (LJTI - $+8) - + const TargetInstrDesc &TID = MI.getDesc(); // Emit the 'add' instruction. @@ -527,7 +539,6 @@ void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) { Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift; // Encode the displacement. - // Set bit I(25) to identify this is the immediate form of <shifter_op>. Binary |= 1 << ARMII::I_BitShift; emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base); @@ -576,8 +587,8 @@ void Emitter<CodeEmitter>::emitPseudoMoveInstruction(const MachineInstr &MI) { template<class CodeEmitter> void Emitter<CodeEmitter>::addPCLabel(unsigned LabelID) { - DOUT << " ** LPC" << LabelID << " @ " - << (void*)MCE.getCurrentPCValue() << '\n'; + DEBUG(errs() << " ** LPC" << LabelID << " @ " + << (void*)MCE.getCurrentPCValue() << '\n'); JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue()); } @@ -586,13 +597,13 @@ void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) { unsigned Opcode = MI.getDesc().Opcode; switch (Opcode) { default: - abort(); // FIXME: + llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction"); + // FIXME: Add support for MOVimm32. case TargetInstrInfo::INLINEASM: { // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. if (MI.getOperand(0).getSymbolName()[0]) { - assert(0 && "JIT does not support inline asm!\n"); - abort(); + llvm_report_error("JIT does not support inline asm!"); } break; } @@ -601,7 +612,7 @@ void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) { MCE.emitLabel(MI.getOperand(0).getImm()); break; case TargetInstrInfo::IMPLICIT_DEF: - case TargetInstrInfo::DECLARE: + case TargetInstrInfo::KILL: case ARM::DWARF_LOC: // Do nothing. break; @@ -674,7 +685,7 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue( // ROR - 0111 // RRX - 0110 and bit[11:8] clear. switch (SOpc) { - default: assert(0 && "Unknown shift opc!"); + default: llvm_unreachable("Unknown shift opc!"); case ARM_AM::lsl: SBits = 0x1; break; case ARM_AM::lsr: SBits = 0x3; break; case ARM_AM::asr: SBits = 0x5; break; @@ -688,7 +699,7 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue( // ASR - 100 // ROR - 110 switch (SOpc) { - default: assert(0 && "Unknown shift opc!"); + default: llvm_unreachable("Unknown shift opc!"); case ARM_AM::lsl: SBits = 0x0; break; case ARM_AM::lsr: SBits = 0x2; break; case ARM_AM::asr: SBits = 0x4; break; @@ -713,12 +724,15 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue( template<class CodeEmitter> unsigned Emitter<CodeEmitter>::getMachineSoImmOpValue(unsigned SoImm) { + int SoImmVal = ARM_AM::getSOImmVal(SoImm); + assert(SoImmVal != -1 && "Not a valid so_imm value!"); + // Encode rotate_imm. 
- unsigned Binary = (ARM_AM::getSOImmValRot(SoImm) >> 1) + unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1) << ARMII::SoRotImmShift; // Encode immed_8. - Binary |= ARM_AM::getSOImmValImm(SoImm); + Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal); return Binary; } @@ -740,6 +754,10 @@ void Emitter<CodeEmitter>::emitDataProcessingInstruction( unsigned ImplicitRn) { const TargetInstrDesc &TID = MI.getDesc(); + if (TID.Opcode == ARM::BFC) { + llvm_report_error("ARMv6t2 JIT is not yet supported."); + } + // Part of binary is determined by TableGen. unsigned Binary = getBinaryCodeForInstr(MI); @@ -791,9 +809,7 @@ void Emitter<CodeEmitter>::emitDataProcessingInstruction( } // Encode so_imm. - // Set bit I(25) to identify this is the immediate form of <shifter_op>. - Binary |= 1 << ARMII::I_BitShift; - Binary |= getMachineSoImmOpValue(MO.getImm()); + Binary |= getMachineSoImmOpValue((unsigned)MO.getImm()); emitWordLE(Binary); } @@ -952,8 +968,8 @@ static unsigned getAddrModeUPBits(unsigned Mode) { // DA - Decrement after - bit U = 0 and bit P = 0 // DB - Decrement before - bit U = 0 and bit P = 1 switch (Mode) { - default: assert(0 && "Unknown addressing sub-mode!"); - case ARM_AM::da: break; + default: llvm_unreachable("Unknown addressing sub-mode!"); + case ARM_AM::da: break; case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break; case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break; case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break; @@ -983,7 +999,7 @@ void Emitter<CodeEmitter>::emitLoadStoreMultipleInstruction( Binary |= 0x1 << ARMII::W_BitShift; // Set registers - for (unsigned i = 4, e = MI.getNumOperands(); i != e; ++i) { + for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || MO.isImplicit()) break; @@ -1107,7 +1123,7 @@ void Emitter<CodeEmitter>::emitMiscArithInstruction(const MachineInstr &MI) { unsigned ShiftAmt = MI.getOperand(OpIdx).getImm(); assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!"); Binary |= ShiftAmt << ARMII::ShiftShift; - + emitWordLE(Binary); } @@ -1115,8 +1131,9 @@ template<class CodeEmitter> void Emitter<CodeEmitter>::emitBranchInstruction(const MachineInstr &MI) { const TargetInstrDesc &TID = MI.getDesc(); - if (TID.Opcode == ARM::TPsoft) - abort(); // FIXME + if (TID.Opcode == ARM::TPsoft) { + llvm_unreachable("ARM::TPsoft FIXME"); // FIXME + } // Part of binary is determined by TableGen. unsigned Binary = getBinaryCodeForInstr(MI); @@ -1135,7 +1152,8 @@ void Emitter<CodeEmitter>::emitInlineJumpTable(unsigned JTIndex) { // Remember the base address of the inline jump table. uintptr_t JTBase = MCE.getCurrentPCValue(); JTI->addJumpTableBaseAddr(JTIndex, JTBase); - DOUT << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase << '\n'; + DEBUG(errs() << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase + << '\n'); // Now emit the jump table entries. const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs; @@ -1155,17 +1173,17 @@ void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) { const TargetInstrDesc &TID = MI.getDesc(); // Handle jump tables. - if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd || - TID.Opcode == ARM::t2BR_JTr || TID.Opcode == ARM::t2BR_JTadd) { + if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd) { // First emit a ldr pc, [] instruction. emitDataProcessingInstruction(MI, ARM::PC); // Then emit the inline jump table.
- unsigned JTIndex = (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::t2BR_JTr) + unsigned JTIndex = + (TID.Opcode == ARM::BR_JTr) ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex(); emitInlineJumpTable(JTIndex); return; - } else if (TID.Opcode == ARM::BR_JTm || TID.Opcode == ARM::t2BR_JTm) { + } else if (TID.Opcode == ARM::BR_JTm) { // First emit a ldr pc, [] instruction. emitLoadStoreInstruction(MI, ARM::PC); @@ -1183,7 +1201,7 @@ void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) { if (TID.Opcode == ARM::BX_RET) // The return register is LR. Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR); - else + else // otherwise, set the return register Binary |= getMachineOpValue(MI, 0); @@ -1194,7 +1212,7 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { unsigned RegD = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; bool isSPVFP = false; - RegD = ARMRegisterInfo::getRegisterNumbering(RegD, isSPVFP); + RegD = ARMRegisterInfo::getRegisterNumbering(RegD, &isSPVFP); if (!isSPVFP) Binary |= RegD << ARMII::RegRdShift; else { @@ -1208,7 +1226,7 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { unsigned RegN = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; bool isSPVFP = false; - RegN = ARMRegisterInfo::getRegisterNumbering(RegN, isSPVFP); + RegN = ARMRegisterInfo::getRegisterNumbering(RegN, &isSPVFP); if (!isSPVFP) Binary |= RegN << ARMII::RegRnShift; else { @@ -1222,7 +1240,7 @@ static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) { unsigned RegM = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; bool isSPVFP = false; - RegM = ARMRegisterInfo::getRegisterNumbering(RegM, isSPVFP); + RegM = ARMRegisterInfo::getRegisterNumbering(RegM, &isSPVFP); if (!isSPVFP) Binary |= RegM; else { @@ -1268,7 +1286,7 @@ void Emitter<CodeEmitter>::emitVFPArithInstruction(const MachineInstr &MI) { // Encode Dm / Sm. Binary |= encodeVFPRm(MI, OpIdx); - + emitWordLE(Binary); } @@ -1386,11 +1404,11 @@ void Emitter<CodeEmitter>::emitVFPLoadStoreMultipleInstruction( Binary |= 0x1 << ARMII::W_BitShift; // First register is encoded in Dd. - Binary |= encodeVFPRd(MI, 4); + Binary |= encodeVFPRd(MI, 5); // Number of registers are encoded in offset field. 
unsigned NumRegs = 1; - for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) { + for (unsigned i = 6, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || MO.isImplicit()) break; @@ -1413,4 +1431,3 @@ void Emitter<CodeEmitter>::emitMiscInstruction(const MachineInstr &MI) { } #include "ARMGenCodeEmitter.inc" - diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 9fedaa4..309e3ba 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -15,24 +15,31 @@ #define DEBUG_TYPE "arm-cp-islands" #include "ARM.h" +#include "ARMAddressingModes.h" #include "ARMMachineFunctionInfo.h" #include "ARMInstrInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" using namespace llvm; -STATISTIC(NumCPEs, "Number of constpool entries"); -STATISTIC(NumSplit, "Number of uncond branches inserted"); -STATISTIC(NumCBrFixed, "Number of cond branches fixed"); -STATISTIC(NumUBrFixed, "Number of uncond branches fixed"); +STATISTIC(NumCPEs, "Number of constpool entries"); +STATISTIC(NumSplit, "Number of uncond branches inserted"); +STATISTIC(NumCBrFixed, "Number of cond branches fixed"); +STATISTIC(NumUBrFixed, "Number of uncond branches fixed"); +STATISTIC(NumTBs, "Number of table branches generated"); +STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk"); +STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk"); namespace { /// ARMConstantIslands - Due to limited PC-relative displacements, ARM @@ -63,6 +70,8 @@ namespace { /// to a return, unreachable, or unconditional branch). std::vector<MachineBasicBlock*> WaterList; + typedef std::vector<MachineBasicBlock*>::iterator water_iterator; + /// CPUser - One user of a constant pool, keeping the machine instruction /// pointer, the constant pool being referenced, and the max displacement /// allowed from the instruction to the CP. @@ -70,8 +79,11 @@ namespace { MachineInstr *MI; MachineInstr *CPEMI; unsigned MaxDisp; - CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp) - : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {} + bool NegOk; + bool IsSoImm; + CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp, + bool neg, bool soimm) + : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm) {} }; /// CPUsers - Keep track of all of the machine instructions that use various @@ -117,29 +129,34 @@ namespace { /// SmallVector<MachineInstr*, 4> PushPopMIs; + /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions. + SmallVector<MachineInstr*, 4> T2JumpTables; + /// HasFarJump - True if any far jump instruction has been emitted during /// the branch fix up pass. 
bool HasFarJump; const TargetInstrInfo *TII; + const ARMSubtarget *STI; ARMFunctionInfo *AFI; bool isThumb; + bool isThumb1; bool isThumb2; public: static char ID; ARMConstantIslands() : MachineFunctionPass(&ID) {} - virtual bool runOnMachineFunction(MachineFunction &Fn); + virtual bool runOnMachineFunction(MachineFunction &MF); virtual const char *getPassName() const { return "ARM constant island placement and branch shortening pass"; } private: - void DoInitialPlacement(MachineFunction &Fn, + void DoInitialPlacement(MachineFunction &MF, std::vector<MachineInstr*> &CPEMIs); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - void InitialFunctionScan(MachineFunction &Fn, + void InitialFunctionScan(MachineFunction &MF, const std::vector<MachineInstr*> &CPEMIs); MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI); void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB); @@ -147,58 +164,62 @@ namespace { bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI); int LookForExistingCPEntry(CPUser& U, unsigned UserOffset); bool LookForWater(CPUser&U, unsigned UserOffset, - MachineBasicBlock** NewMBB); - MachineBasicBlock* AcceptWater(MachineBasicBlock *WaterBB, - std::vector<MachineBasicBlock*>::iterator IP); + MachineBasicBlock *&NewMBB); + MachineBasicBlock *AcceptWater(water_iterator IP); void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset, - MachineBasicBlock** NewMBB); - bool HandleConstantPoolUser(MachineFunction &Fn, unsigned CPUserIndex); + MachineBasicBlock *&NewMBB); + bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex); void RemoveDeadCPEMI(MachineInstr *CPEMI); bool RemoveUnusedCPEntries(); bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, - MachineInstr *CPEMI, unsigned Disp, - bool DoDump); + MachineInstr *CPEMI, unsigned Disp, bool NegOk, + bool DoDump = false); bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water, CPUser &U); bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, - unsigned Disp, bool NegativeOK); + unsigned Disp, bool NegativeOK, bool IsSoImm = false); bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); - bool FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br); - bool FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br); - bool FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br); + bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br); bool UndoLRSpillRestore(); + bool OptimizeThumb2Instructions(MachineFunction &MF); + bool OptimizeThumb2Branches(MachineFunction &MF); + bool OptimizeThumb2JumpTables(MachineFunction &MF); unsigned GetOffsetOf(MachineInstr *MI) const; void dumpBBs(); - void verify(MachineFunction &Fn); + void verify(MachineFunction &MF); }; char ARMConstantIslands::ID = 0; } /// verify - check BBOffsets, BBSizes, alignment of islands -void ARMConstantIslands::verify(MachineFunction &Fn) { +void ARMConstantIslands::verify(MachineFunction &MF) { assert(BBOffsets.size() == BBSizes.size()); for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i) assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]); - if (isThumb) { - for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - if (!MBB->empty() && - MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) - assert((BBOffsets[MBB->getNumber()]%4 == 0 && - BBSizes[MBB->getNumber()]%4 == 0) || - 
(BBOffsets[MBB->getNumber()]%4 != 0 && - BBSizes[MBB->getNumber()]%4 != 0)); + if (!isThumb) + return; +#ifndef NDEBUG + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock *MBB = MBBI; + if (!MBB->empty() && + MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + unsigned MBBId = MBB->getNumber(); + assert((BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) || + (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0)); } } +#endif } /// print block size and offset information - debugging void ARMConstantIslands::dumpBBs() { for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) { - DOUT << "block " << J << " offset " << BBOffsets[J] << - " size " << BBSizes[J] << "\n"; + DEBUG(errs() << "block " << J << " offset " << BBOffsets[J] + << " size " << BBSizes[J] << "\n"); } } @@ -208,31 +229,36 @@ FunctionPass *llvm::createARMConstantIslandPass() { return new ARMConstantIslands(); } -bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) { - MachineConstantPool &MCP = *Fn.getConstantPool(); +bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { + MachineConstantPool &MCP = *MF.getConstantPool(); + + TII = MF.getTarget().getInstrInfo(); + AFI = MF.getInfo<ARMFunctionInfo>(); + STI = &MF.getTarget().getSubtarget<ARMSubtarget>(); - TII = Fn.getTarget().getInstrInfo(); - AFI = Fn.getInfo<ARMFunctionInfo>(); isThumb = AFI->isThumbFunction(); + isThumb1 = AFI->isThumb1OnlyFunction(); isThumb2 = AFI->isThumb2Function(); HasFarJump = false; // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. - Fn.RenumberBlocks(); + MF.RenumberBlocks(); + + // Thumb1 functions containing constant pools get 4-byte alignment. + // This is so we can keep exact track of where the alignment padding goes. - /// Thumb functions containing constant pools get 2-byte alignment. - /// This is so we can keep exact track of where the alignment padding goes. - /// Set default. - AFI->setAlign(isThumb ? 1U : 2U); + // Set default. Thumb1 function is 2-byte aligned, ARM and Thumb2 are 4-byte + // aligned. + AFI->setAlign(isThumb1 ? 1U : 2U); // Perform the initial placement of the constant pool entries. To start with, // we put them all at the end of the function. std::vector<MachineInstr*> CPEMIs; if (!MCP.isEmpty()) { - DoInitialPlacement(Fn, CPEMIs); - if (isThumb) + DoInitialPlacement(MF, CPEMIs); + if (isThumb1) AFI->setAlign(2U); } @@ -242,7 +268,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) { // Do the initial scan of the function, building up information about the // sizes of each block, the location of all the water, and finding all of the // constant pool users. - InitialFunctionScan(Fn, CPEMIs); + InitialFunctionScan(MF, CPEMIs); CPEMIs.clear(); /// Remove dead constant pool entries. @@ -251,25 +277,37 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) { // Iteratively place constant pool entries and fix up branches until there // is no change. 
bool MadeChange = false; + unsigned NoCPIters = 0, NoBRIters = 0; while (true) { - bool Change = false; + bool CPChange = false; for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) - Change |= HandleConstantPoolUser(Fn, i); + CPChange |= HandleConstantPoolUser(MF, i); + if (CPChange && ++NoCPIters > 30) + llvm_unreachable("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); + + bool BRChange = false; for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) - Change |= FixUpImmediateBr(Fn, ImmBranches[i]); + BRChange |= FixUpImmediateBr(MF, ImmBranches[i]); + if (BRChange && ++NoBRIters > 30) + llvm_unreachable("Branch Fix Up pass failed to converge!"); DEBUG(dumpBBs()); - if (!Change) + + if (!CPChange && !BRChange) break; MadeChange = true; } + // Shrink 32-bit Thumb2 branch, load, and store instructions. + if (isThumb2) + MadeChange |= OptimizeThumb2Instructions(MF); + // After a while, this might be made debug-only, but it is not expensive. - verify(Fn); + verify(MF); // If LR has been forced spilled and no far jumps (i.e. BL) has been issued. // Undo the spill / restore of LR if possible. - if (!HasFarJump && AFI->isLRSpilledForFarJump() && isThumb) + if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) MadeChange |= UndoLRSpillRestore(); BBSizes.clear(); @@ -279,24 +317,25 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) { CPEntries.clear(); ImmBranches.clear(); PushPopMIs.clear(); + T2JumpTables.clear(); return MadeChange; } /// DoInitialPlacement - Perform the initial placement of the constant pool /// entries. To start with, we put them all at the end of the function. -void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn, +void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, std::vector<MachineInstr*> &CPEMIs) { // Create the basic block to hold the CPE's. - MachineBasicBlock *BB = Fn.CreateMachineBasicBlock(); - Fn.push_back(BB); + MachineBasicBlock *BB = MF.CreateMachineBasicBlock(); + MF.push_back(BB); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. const std::vector<MachineConstantPoolEntry> &CPs = - Fn.getConstantPool()->getConstants(); + MF.getConstantPool()->getConstants(); - const TargetData &TD = *Fn.getTarget().getTargetData(); + const TargetData &TD = *MF.getTarget().getTargetData(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); // Verify that all constant pool entries are a multiple of 4 bytes. If not, @@ -313,7 +352,8 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn, CPEs.push_back(CPEntry(CPEMI, i)); CPEntries.push_back(CPEs); NumCPEs++; - DOUT << "Moved CPI#" << i << " to end of function as #" << i << "\n"; + DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i + << "\n"); } } @@ -352,10 +392,10 @@ ARMConstantIslands::CPEntry /// InitialFunctionScan - Do the initial scan of the function, building up /// information about the sizes of each block, the location of all the water, /// and finding all of the constant pool users. 
-void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn, +void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, const std::vector<MachineInstr*> &CPEMIs) { unsigned Offset = 0; - for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end(); + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); MBBI != E; ++MBBI) { MachineBasicBlock &MBB = *MBBI; @@ -377,18 +417,19 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn, unsigned Scale = 1; int UOpc = Opc; switch (Opc) { + default: + continue; // Ignore other JT branches case ARM::tBR_JTr: - case ARM::t2BR_JTr: - case ARM::t2BR_JTm: - case ARM::t2BR_JTadd: - // A Thumb table jump may involve padding; for the offsets to + // A Thumb1 table jump may involve padding; for the offsets to // be right, functions containing these must be 4-byte aligned. AFI->setAlign(2U); if ((Offset+MBBSize)%4 != 0) + // FIXME: Add a pseudo ALIGN instruction instead. MBBSize += 2; // padding continue; // Does not get an entry in ImmBranches - default: - continue; // Ignore other JT branches + case ARM::t2BR_JT: + T2JumpTables.push_back(I); + continue; // Does not get an entry in ImmBranches case ARM::Bcc: isCond = true; UOpc = ARM::B; @@ -427,6 +468,9 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn, if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET) PushPopMIs.push_back(I); + if (Opc == ARM::CONSTPOOL_ENTRY) + continue; + // Scan the instructions for constant pool operands. for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) if (I->getOperand(op).isCPI()) { @@ -436,50 +480,52 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn, // Basic size info comes from the TSFlags field. unsigned Bits = 0; unsigned Scale = 1; - unsigned TSFlags = I->getDesc().TSFlags; - switch (TSFlags & ARMII::AddrModeMask) { + bool NegOk = false; + bool IsSoImm = false; + + switch (Opc) { default: - // Constant pool entries can reach anything. - if (I->getOpcode() == ARM::CONSTPOOL_ENTRY) - continue; - if (I->getOpcode() == ARM::tLEApcrel) { - Bits = 8; // Taking the address of a CP entry. - break; - } - assert(0 && "Unknown addressing mode for CP reference!"); - case ARMII::AddrMode1: // AM1: 8 bits << 2 - Bits = 8; - Scale = 4; // Taking the address of a CP entry. - break; - case ARMII::AddrMode2: - Bits = 12; // +-offset_12 - break; - case ARMII::AddrMode3: - Bits = 8; // +-offset_8 + llvm_unreachable("Unknown addressing mode for CP reference!"); break; - // addrmode4 has no immediate offset. - case ARMII::AddrMode5: + + // Taking the address of a CP entry. + case ARM::LEApcrel: + // This takes a SoImm, which is an 8-bit immediate rotated. We'll + // pretend the maximum offset is 255 * 4. Since each instruction + // is 4 bytes wide, this is always correct. We'll check for other + // displacements that fit in a SoImm as well. Bits = 8; - Scale = 4; // +-(offset_8*4) + Scale = 4; + NegOk = true; + IsSoImm = true; break; - // addrmode6 has no immediate offset.
- case ARMII::AddrModeT1_1: - Bits = 5; // +offset_5 + case ARM::t2LEApcrel: + Bits = 12; + NegOk = true; break; - case ARMII::AddrModeT1_2: - Bits = 5; - Scale = 2; // +(offset_5*2) + case ARM::tLEApcrel: + Bits = 8; + Scale = 4; break; - case ARMII::AddrModeT1_4: - Bits = 5; - Scale = 4; // +(offset_5*4) + + case ARM::LDR: + case ARM::LDRcp: + case ARM::t2LDRpci: + Bits = 12; // +-offset_12 + NegOk = true; break; - case ARMII::AddrModeT1_s: + + case ARM::tLDRpci: + case ARM::tLDRcp: Bits = 8; Scale = 4; // +(offset_8*4) break; - case ARMII::AddrModeT2_pc: - Bits = 12; // +-offset_12 + + case ARM::FLDD: + case ARM::FLDS: + Bits = 8; + Scale = 4; // +-(offset_8*4) + NegOk = true; break; } @@ -487,7 +533,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn, unsigned CPI = I->getOperand(op).getIndex(); MachineInstr *CPEMI = CPEMIs[CPI]; unsigned MaxOffs = ((1 << Bits)-1) * Scale; - CPUsers.push_back(CPUser(I, CPEMI, MaxOffs)); + CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm)); // Increment corresponding CPEntry reference count. CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); @@ -563,7 +609,7 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { // Next, update WaterList. Specifically, we need to add NewMBB as having // available water after it. - std::vector<MachineBasicBlock*>::iterator IP = + water_iterator IP = std::lower_bound(WaterList.begin(), WaterList.end(), NewBB, CompareMBBNumbers); WaterList.insert(IP, NewBB); @@ -590,8 +636,8 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // Note the new unconditional branch is not being recorded. // There doesn't seem to be meaningful DebugInfo available; this doesn't // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc::getUnknownLoc(), - TII->get(isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B)).addMBB(NewBB); + unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; + BuildMI(OrigBB, DebugLoc::getUnknownLoc(), TII->get(Opc)).addMBB(NewBB); NumSplit++; // Update the CFG. All succs of OrigBB are now succs of NewBB. @@ -625,7 +671,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // available water after it (but not if it's already there, which happens // when splitting before a conditional branch that is followed by an // unconditional branch - in that case we want to insert NewBB). - std::vector<MachineBasicBlock*>::iterator IP = + water_iterator IP = std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB, CompareMBBNumbers); MachineBasicBlock* WaterBB = *IP; @@ -648,7 +694,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // We removed instructions from UserMBB, subtract that off from its size. // Add 2 or 4 to the block to count the unconditional branch we added to it. - unsigned delta = isThumb ? 2 : 4; + int delta = isThumb1 ? 2 : 4; BBSizes[OrigBBI] -= NewBBSize - delta; // ...and adjust BBOffsets for NewBB accordingly. @@ -664,24 +710,39 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { /// reference) is within MaxDisp of TrialOffset (a proposed location of a /// constant pool entry). 
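The Bits/Scale pairs above bound each instruction's reach at ((1 << Bits) - 1) * Scale bytes, which OffsetIsInRange below compares against a proposed placement. A minimal standalone sketch of both computations (helper names invented; the Thumb alignment compensation performed by the real function is omitted):

    // Reach of an immediate field: largest encodable value times its byte
    // scale, matching "MaxOffs = ((1 << Bits)-1) * Scale" in the scan above.
    constexpr unsigned maxDisp(unsigned Bits, unsigned Scale) {
      return ((1u << Bits) - 1) * Scale;  // e.g. maxDisp(8, 4) == 1020
    }

    // Core of the range test: a forward displacement must fit in MaxDisp;
    // a backward one is allowed only when the encoding can express it (NegOk).
    constexpr bool inDispRange(unsigned UserOffset, unsigned TrialOffset,
                               unsigned MaxDisp, bool NegOk) {
      return UserOffset <= TrialOffset
                 ? TrialOffset - UserOffset <= MaxDisp
                 : (NegOk && UserOffset - TrialOffset <= MaxDisp);
    }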
bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, - unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK) { + unsigned TrialOffset, unsigned MaxDisp, + bool NegativeOK, bool IsSoImm) { // On Thumb offsets==2 mod 4 are rounded down by the hardware for // purposes of the displacement computation; compensate for that here. // Effectively, the valid range of displacements is 2 bytes smaller for such // references. - if (isThumb && UserOffset%4 !=0) + unsigned TotalAdj = 0; + if (isThumb && UserOffset%4 !=0) { UserOffset -= 2; + TotalAdj = 2; + } // CPEs will be rounded up to a multiple of 4. - if (isThumb && TrialOffset%4 != 0) + if (isThumb && TrialOffset%4 != 0) { TrialOffset += 2; + TotalAdj += 2; + } + + // In Thumb2 mode, later branch adjustments can shift instructions up and + // cause alignment change. In the worst case scenario this can cause the + // user's effective address to be subtracted by 2 and the CPE's address to + // be plus 2. + if (isThumb2 && TotalAdj != 4) + MaxDisp -= (4 - TotalAdj); if (UserOffset <= TrialOffset) { // User before the Trial. - if (TrialOffset-UserOffset <= MaxDisp) + if (TrialOffset - UserOffset <= MaxDisp) return true; + // FIXME: Make use full range of soimm values. } else if (NegativeOK) { - if (UserOffset-TrialOffset <= MaxDisp) + if (UserOffset - TrialOffset <= MaxDisp) return true; + // FIXME: Make use full range of soimm values. } return false; } @@ -690,39 +751,36 @@ bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, /// Water (a basic block) will be in range for the specific MI. bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, - MachineBasicBlock* Water, CPUser &U) -{ + MachineBasicBlock* Water, CPUser &U) { unsigned MaxDisp = U.MaxDisp; - MachineFunction::iterator I = next(MachineFunction::iterator(Water)); unsigned CPEOffset = BBOffsets[Water->getNumber()] + BBSizes[Water->getNumber()]; // If the CPE is to be inserted before the instruction, that will raise - // the offset of the instruction. (Currently applies only to ARM, so - // no alignment compensation attempted here.) + // the offset of the instruction. if (CPEOffset < UserOffset) UserOffset += U.CPEMI->getOperand(2).getImm(); - return OffsetIsInRange (UserOffset, CPEOffset, MaxDisp, !isThumb); + return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm); } /// CPEIsInRange - Returns true if the distance between specific MI and /// specific ConstPool entry instruction can fit in MI's displacement field. 
bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset, - MachineInstr *CPEMI, - unsigned MaxDisp, bool DoDump) { + MachineInstr *CPEMI, unsigned MaxDisp, + bool NegOk, bool DoDump) { unsigned CPEOffset = GetOffsetOf(CPEMI); assert(CPEOffset%4 == 0 && "Misaligned CPE"); if (DoDump) { - DOUT << "User of CPE#" << CPEMI->getOperand(0).getImm() - << " max delta=" << MaxDisp - << " insn address=" << UserOffset - << " CPE address=" << CPEOffset - << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI; + DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm() + << " max delta=" << MaxDisp + << " insn address=" << UserOffset + << " CPE address=" << CPEOffset + << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI); } - return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb); + return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk); } #ifndef NDEBUG @@ -745,52 +803,48 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) { void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta) { MachineFunction::iterator MBBI = BB; MBBI = next(MBBI); - for(unsigned i=BB->getNumber()+1; i<BB->getParent()->getNumBlockIDs(); i++) { + for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs(); + i < e; ++i) { BBOffsets[i] += delta; // If some existing blocks have padding, adjust the padding as needed, a // bit tricky. delta can be negative so don't use % on that. - if (isThumb) { - MachineBasicBlock *MBB = MBBI; - if (!MBB->empty()) { - // Constant pool entries require padding. - if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { - unsigned oldOffset = BBOffsets[i] - delta; - if (oldOffset%4==0 && BBOffsets[i]%4!=0) { - // add new padding - BBSizes[i] += 2; - delta += 2; - } else if (oldOffset%4!=0 && BBOffsets[i]%4==0) { - // remove existing padding - BBSizes[i] -=2; - delta -= 2; - } + if (!isThumb) + continue; + MachineBasicBlock *MBB = MBBI; + if (!MBB->empty()) { + // Constant pool entries require padding. + if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + unsigned OldOffset = BBOffsets[i] - delta; + if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) { + // add new padding + BBSizes[i] += 2; + delta += 2; + } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) { + // remove existing padding + BBSizes[i] -= 2; + delta -= 2; } - // Thumb jump tables require padding. They should be at the end; - // following unconditional branches are removed by AnalyzeBranch. - MachineInstr *ThumbJTMI = NULL; - if ((prior(MBB->end())->getOpcode() == ARM::tBR_JTr) - || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTr) - || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTm) - || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTadd)) - ThumbJTMI = prior(MBB->end()); - if (ThumbJTMI) { - unsigned newMIOffset = GetOffsetOf(ThumbJTMI); - unsigned oldMIOffset = newMIOffset - delta; - if (oldMIOffset%4 == 0 && newMIOffset%4 != 0) { - // remove existing padding - BBSizes[i] -= 2; - delta -= 2; - } else if (oldMIOffset%4 != 0 && newMIOffset%4 == 0) { - // add new padding - BBSizes[i] += 2; - delta += 2; - } + } + // Thumb1 jump tables require padding. They should be at the end; + // following unconditional branches are removed by AnalyzeBranch. 
+ MachineInstr *ThumbJTMI = prior(MBB->end()); + if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { + unsigned NewMIOffset = GetOffsetOf(ThumbJTMI); + unsigned OldMIOffset = NewMIOffset - delta; + if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) { + // remove existing padding + BBSizes[i] -= 2; + delta -= 2; + } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) { + // add new padding + BBSizes[i] += 2; + delta += 2; } - if (delta==0) - return; } - MBBI = next(MBBI); + if (delta==0) + return; } + MBBI = next(MBBI); } } @@ -824,8 +878,8 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) MachineInstr *CPEMI = U.CPEMI; // Check to see if the CPE is already in-range. - if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, true)) { - DOUT << "In range\n"; + if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) { + DEBUG(errs() << "In range\n"); return 1; } @@ -839,8 +893,9 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) // Removing CPEs can leave empty entries, skip if (CPEs[i].CPEMI == NULL) continue; - if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, false)) { - DOUT << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n"; + if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) { + DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#" + << CPEs[i].CPI << "\n"); // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. @@ -870,15 +925,15 @@ static inline unsigned getUnconditionalBrDisp(int Opc) { default: break; } - + return ((1<<23)-1)*4; } /// AcceptWater - Small amount of common code factored out of the following. - -MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB, - std::vector<MachineBasicBlock*>::iterator IP) { - DOUT << "found water in range\n"; +/// +MachineBasicBlock *ARMConstantIslands::AcceptWater(water_iterator IP) { + DEBUG(errs() << "found water in range\n"); + MachineBasicBlock *WaterBB = *IP; // Remove the original WaterList entry; we want subsequent // insertions in this vicinity to go after the one we're // about to insert. This considerably reduces the number @@ -890,41 +945,44 @@ MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB, /// LookForWater - look for an existing entry in the WaterList in which /// we can place the CPE referenced from U so it's within range of U's MI. -/// Returns true if found, false if not. If it returns true, *NewMBB -/// is set to the WaterList entry. -/// For ARM, we prefer the water that's farthest away. For Thumb, prefer -/// water that will not introduce padding to water that will; within each -/// group, prefer the water that's farthest away. - +/// Returns true if found, false if not. If it returns true, NewMBB +/// is set to the WaterList entry. For Thumb, prefer water that will not +/// introduce padding to water that will. To ensure that this pass +/// terminates, the CPE location for a particular CPUser is only allowed to +/// move to a lower address, so search backward from the end of the list and +/// prefer the first water that is in range. 
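The termination argument in the comment above comes down to a backward walk with a remembered fallback: accept the first in-range water that needs no padding, otherwise the first in-range water that does. The shape of that scan, reduced to a self-contained sketch (generic names invented; the real WaterList bookkeeping is more involved):

    // Walk candidates from highest address to lowest. Return the first that
    // fits without padding; otherwise the first padding candidate seen;
    // otherwise E to signal that nothing was in range.
    template <typename Iter, typename P1, typename P2>
    Iter scanBackward(Iter B, Iter E, P1 FitsNoPad, P2 FitsWithPad) {
      Iter Fallback = E;
      for (Iter I = E; I != B;) {
        --I;
        if (FitsNoPad(*I))
          return I;                  // best case, take it immediately
        if (Fallback == E && FitsWithPad(*I))
          Fallback = I;              // remember, but keep searching
      }
      return Fallback;
    }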
bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, - MachineBasicBlock** NewMBB) { - std::vector<MachineBasicBlock*>::iterator IPThatWouldPad; - MachineBasicBlock* WaterBBThatWouldPad = NULL; - if (!WaterList.empty()) { - for (std::vector<MachineBasicBlock*>::iterator IP = prior(WaterList.end()), - B = WaterList.begin();; --IP) { - MachineBasicBlock* WaterBB = *IP; - if (WaterIsInRange(UserOffset, WaterBB, U)) { - if (isThumb && - (BBOffsets[WaterBB->getNumber()] + - BBSizes[WaterBB->getNumber()])%4 != 0) { - // This is valid Water, but would introduce padding. Remember - // it in case we don't find any Water that doesn't do this. - if (!WaterBBThatWouldPad) { - WaterBBThatWouldPad = WaterBB; - IPThatWouldPad = IP; - } - } else { - *NewMBB = AcceptWater(WaterBB, IP); - return true; + MachineBasicBlock *&NewMBB) { + if (WaterList.empty()) + return false; + + bool FoundWaterThatWouldPad = false; + water_iterator IPThatWouldPad; + for (water_iterator IP = prior(WaterList.end()), + B = WaterList.begin();; --IP) { + MachineBasicBlock* WaterBB = *IP; + // Check if water is in range and at a lower address than the current one. + if (WaterIsInRange(UserOffset, WaterBB, U) && + WaterBB->getNumber() < U.CPEMI->getParent()->getNumber()) { + unsigned WBBId = WaterBB->getNumber(); + if (isThumb && + (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) { + // This is valid Water, but would introduce padding. Remember + // it in case we don't find any Water that doesn't do this. + if (!FoundWaterThatWouldPad) { + FoundWaterThatWouldPad = true; + IPThatWouldPad = IP; } + } else { + NewMBB = AcceptWater(IP); + return true; + } } - if (IP == B) - break; - } + if (IP == B) + break; } - if (isThumb && WaterBBThatWouldPad) { - *NewMBB = AcceptWater(WaterBBThatWouldPad, IPThatWouldPad); + if (FoundWaterThatWouldPad) { + NewMBB = AcceptWater(IPThatWouldPad); return true; } return false; @@ -934,12 +992,12 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, /// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the /// block is used if in range, and the conditional branch munged so control /// flow is correct. Otherwise the block is split to create a hole with an -/// unconditional branch around it. In either case *NewMBB is set to a +/// unconditional branch around it. In either case NewMBB is set to a /// block following which the new island can be inserted (the WaterList /// is not adjusted). - void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, - unsigned UserOffset, MachineBasicBlock** NewMBB) { + unsigned UserOffset, + MachineBasicBlock *&NewMBB) { CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; @@ -950,18 +1008,18 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, // If the use is at the end of the block, or the end of the block // is within range, make new water there. (The addition below is - // for the unconditional branch we will be adding: 4 bytes on ARM, - // 2 on Thumb. Possible Thumb alignment padding is allowed for + // for the unconditional branch we will be adding: 4 bytes on ARM + Thumb2, + // 2 on Thumb1. Possible Thumb1 alignment padding is allowed for // inside OffsetIsInRange. // If the block ends in an unconditional branch already, it is water, // and is known to be out of range, so we'll always be adding a branch.) if (&UserMBB->back() == UserMI || - OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb ? 
2: 4), - U.MaxDisp, !isThumb)) { - DOUT << "Split at end of block\n"; + OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4), + U.MaxDisp, U.NegOk, U.IsSoImm)) { + DEBUG(errs() << "Split at end of block\n"); if (&UserMBB->back() == UserMI) assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!"); - *NewMBB = next(MachineFunction::iterator(UserMBB)); + NewMBB = next(MachineFunction::iterator(UserMBB)); // Add an unconditional branch from UserMBB to fallthrough block. // Record it for branch lengthening; this new branch will not get out of // range, but if the preceding conditional branch is out of range, the @@ -969,16 +1027,16 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, // range, so the machinery has to know about it. int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B; BuildMI(UserMBB, DebugLoc::getUnknownLoc(), - TII->get(UncondBr)).addMBB(*NewMBB); + TII->get(UncondBr)).addMBB(NewMBB); unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); ImmBranches.push_back(ImmBranch(&UserMBB->back(), MaxDisp, false, UncondBr)); - int delta = isThumb ? 2 : 4; + int delta = isThumb1 ? 2 : 4; BBSizes[UserMBB->getNumber()] += delta; AdjustBBOffsetsAfter(UserMBB, delta); } else { // What a big block. Find a place within the block to split it. - // This is a little tricky on Thumb since instructions are 2 bytes + // This is a little tricky on Thumb1 since instructions are 2 bytes // and constant pool entries are 4 bytes: if instruction I references // island CPE, and instruction I+1 references CPE', it will // not work well to put CPE as far forward as possible, since then @@ -991,7 +1049,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, // if not, we back up the insertion point. // The 4 in the following is for the unconditional branch we'll be - // inserting (allows for long branch on Thumb). Alignment of the + // inserting (allows for long branch on Thumb1). Alignment of the // island is handled inside OffsetIsInRange. unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4; // This could point off the end of the block if we've already got @@ -1000,7 +1058,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, // conditional and a maximally long unconditional). if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1]) BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] - - (isThumb ? 6 : 8); + (isThumb1 ? 6 : 8); unsigned EndInsertOffset = BaseInsertOffset + CPEMI->getOperand(2).getImm(); MachineBasicBlock::iterator MI = UserMI; @@ -1011,10 +1069,11 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, Offset += TII->GetInstSizeInBytes(MI), MI = next(MI)) { if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) { + CPUser &U = CPUsers[CPUIndex]; if (!OffsetIsInRange(Offset, EndInsertOffset, - CPUsers[CPUIndex].MaxDisp, !isThumb)) { - BaseInsertOffset -= (isThumb ? 2 : 4); - EndInsertOffset -= (isThumb ? 2 : 4); + U.MaxDisp, U.NegOk, U.IsSoImm)) { + BaseInsertOffset -= (isThumb1 ? 2 : 4); + EndInsertOffset -= (isThumb1 ? 2 : 4); } // This is overly conservative, as we don't account for CPEMIs // being reused within the block, but it doesn't matter much. 
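The split-point arithmetic above is easier to follow with concrete numbers. A self-contained toy run (all values invented; Thumb1 assumed, so one back-off step is a 2-byte instruction):

    #include <cstdio>
    int main() {
      unsigned UserOffset = 0x100; // CP user location, PC adjust included
      unsigned MaxDisp = 1020;     // e.g. tLDRpci reach: (2^8 - 1) * 4
      unsigned Base = UserOffset + MaxDisp - 4;         // initial split point
      std::printf("first trial split at %#x\n", Base);  // 0x4f8
      Base -= 2; // an intervening CP user went out of range: back off
      std::printf("after one back-off at %#x\n", Base); // 0x4f6
      return 0;
    }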
@@ -1022,8 +1081,8 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, CPUIndex++; } } - DOUT << "Split in middle of big block\n"; - *NewMBB = SplitBlockBeforeInstr(prior(MI)); + DEBUG(errs() << "Split in middle of big block\n"); + NewMBB = SplitBlockBeforeInstr(prior(MI)); } } @@ -1031,7 +1090,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, /// is out-of-range. If so, pick up the constant pool value and move it some /// place in-range. Return true if we changed any addresses (thus must run /// another pass of branch lengthening), false otherwise. -bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, +bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex) { CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; @@ -1040,14 +1099,9 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, unsigned Size = CPEMI->getOperand(2).getImm(); MachineBasicBlock *NewMBB; // Compute this only once, it's expensive. The 4 or 8 is the value the - // hardware keeps in the PC (2 insns ahead of the reference). + // hardware keeps in the PC. unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8); - // Special case: tLEApcrel are two instructions MI's. The actual user is the - // second instruction. - if (UserMI->getOpcode() == ARM::tLEApcrel) - UserOffset += 2; - // See if the current entry is within range, or there is a clone of it // in range. int result = LookForExistingCPEntry(U, UserOffset); @@ -1058,19 +1112,16 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, // We will be generating a new clone. Get a UID for it. unsigned ID = AFI->createConstPoolEntryUId(); - // Look for water where we can place this CPE. We look for the farthest one - // away that will work. Forward references only for now (although later - // we might find some that are backwards). - - if (!LookForWater(U, UserOffset, &NewMBB)) { + // Look for water where we can place this CPE. + if (!LookForWater(U, UserOffset, NewMBB)) { // No water found. - DOUT << "No water found\n"; - CreateNewWater(CPUserIndex, UserOffset, &NewMBB); + DEBUG(errs() << "No water found\n"); + CreateNewWater(CPUserIndex, UserOffset, NewMBB); } // Okay, we know we can put an island before NewMBB now, do it! - MachineBasicBlock *NewIsland = Fn.CreateMachineBasicBlock(); - Fn.insert(NewMBB, NewIsland); + MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock(); + MF.insert(NewMBB, NewIsland); // Update internal data structures to account for the newly inserted MBB. UpdateForInsertedWaterBlock(NewIsland); @@ -1101,7 +1152,8 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, break; } - DOUT << " Moved CPE to #" << ID << " CPI=" << CPI << "\t" << *UserMI; + DEBUG(errs() << " Moved CPE to #" << ID << " CPI=" << CPI + << '\t' << *UserMI); return true; } @@ -1115,7 +1167,7 @@ void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { BBSizes[CPEBB->getNumber()] -= Size; // All succeeding offsets have the current size value added in, fix this. if (CPEBB->empty()) { - // In thumb mode, the size of island may be padded by two to compensate for + // In thumb1 mode, the size of island may be padded by two to compensate for // the alignment requirement. Then it will now be 2 when the block is // empty, so fix this. // All succeeding offsets have the current size value added in, fix this. 
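The "4 or 8" bias mentioned above is the architectural PC: an ARM-state instruction reads PC as its own address plus 8, a Thumb instruction as plus 4, and every PC-relative displacement (including the PCAdj applied to branches below) is measured from that point. A one-helper sketch, name invented:

    // Value the pipeline exposes as PC for the instruction at InsnAddr.
    static unsigned effectivePC(unsigned InsnAddr, bool IsThumb) {
      return InsnAddr + (IsThumb ? 4 : 8);
    }
    // e.g. an ARM-state "ldr r0, [pc, #8]" at 0x1000 loads from 0x1010.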
@@ -1157,11 +1209,11 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, unsigned BrOffset = GetOffsetOf(MI) + PCAdj; unsigned DestOffset = BBOffsets[DestBB->getNumber()]; - DOUT << "Branch of destination BB#" << DestBB->getNumber() - << " from BB#" << MI->getParent()->getNumber() - << " max delta=" << MaxDisp - << " from " << GetOffsetOf(MI) << " to " << DestOffset - << " offset " << int(DestOffset-BrOffset) << "\t" << *MI; + DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxDisp + << " from " << GetOffsetOf(MI) << " to " << DestOffset + << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); if (BrOffset <= DestOffset) { // Branch before the Dest. @@ -1176,7 +1228,7 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, /// FixUpImmediateBr - Fix up an immediate branch whose destination is too far /// away to fit in its displacement field. -bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) { +bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); @@ -1185,8 +1237,8 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) { return false; if (!Br.isCond) - return FixUpUnconditionalBr(Fn, Br); - return FixUpConditionalBr(Fn, Br); + return FixUpUnconditionalBr(MF, Br); + return FixUpConditionalBr(MF, Br); } /// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is @@ -1194,10 +1246,11 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) { /// spilled in the epilogue, then we can use BL to implement a far jump. /// Otherwise, add an intermediate branch instruction to a branch. bool -ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) { +ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *MBB = MI->getParent(); - assert(isThumb && !isThumb2 && "Expected a Thumb-1 function!"); + if (!isThumb1) + llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!"); // Use BL to implement far jump. Br.MaxDisp = (1 << 21) * 2; @@ -1207,7 +1260,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) { HasFarJump = true; NumUBrFixed++; - DOUT << " Changed B to long jump " << *MI; + DEBUG(errs() << " Changed B to long jump " << *MI); return true; } @@ -1216,7 +1269,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) { /// far away to fit in its displacement field. It is converted to an inverse /// conditional branch + an unconditional branch to the destination. 
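The rewrite described above replaces one unreachable conditional branch with an inverted short branch over a long unconditional one. A toy stand-in for the condition inversion (the real pass works on machine IR and uses ARMCC::getOppositeCondition; the enum here is invented for illustration):

    // Rewrite applied when "b<cc> Dest" cannot reach Dest:
    //   before:  b<cc>  Dest        ; conditional, out of range
    //   after:   b<!cc> Next        ; inverted condition, short hop
    //            b      Dest        ; unconditional, longer reach
    //   Next:
    enum Cond { EQ, NE, LT, GE };    // tiny stand-in for ARMCC::CondCodes
    static Cond invert(Cond C) {
      switch (C) {
      case EQ: return NE;
      case NE: return EQ;
      case LT: return GE;
      case GE: return LT;
      }
      return EQ;                     // not reached
    }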
bool -ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) { +ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); @@ -1251,7 +1304,8 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) { // b L1 MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); if (BBIsInRange(MI, NewDest, Br.MaxDisp)) { - DOUT << " Invert Bcc condition and swap its destination with " << *BMI; + DEBUG(errs() << " Invert Bcc condition and swap its destination with " + << *BMI); BMI->getOperand(0).setMBB(DestBB); MI->getOperand(0).setMBB(NewDest); MI->getOperand(1).setImm(CC); @@ -1273,9 +1327,9 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) { } MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB)); - DOUT << " Insert B to BB#" << DestBB->getNumber() - << " also invert condition and change dest. to BB#" - << NextBB->getNumber() << "\n"; + DEBUG(errs() << " Insert B to BB#" << DestBB->getNumber() + << " also invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); // Insert a new conditional branch and a new unconditional branch. // Also update the ImmBranch and add a new entry for the new branch. @@ -1300,14 +1354,17 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) { } /// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spill -/// LR / restores LR to pc. +/// LR / restore LR to pc. FIXME: This is done here because it's only possible +/// to do this if tBfar is not used. bool ARMConstantIslands::UndoLRSpillRestore() { bool MadeChange = false; for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { MachineInstr *MI = PushPopMIs[i]; + // First two operands are predicates, the third is a zero since there + // is no writeback. if (MI->getOpcode() == ARM::tPOP_RET && - MI->getOperand(0).getReg() == ARM::PC && - MI->getNumExplicitOperands() == 1) { + MI->getOperand(3).getReg() == ARM::PC && + MI->getNumExplicitOperands() == 4) { BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET)); MI->eraseFromParent(); MadeChange = true; @@ -1315,3 +1372,201 @@ bool ARMConstantIslands::UndoLRSpillRestore() { } return MadeChange; } + +bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) { + bool MadeChange = false; + + // Shrink ADR and LDR from constantpool. + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned Opcode = U.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2LEApcrel: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLEApcrel; + Bits = 8; + Scale = 4; + } + break; + case ARM::t2LDRpci: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLDRpci; + Bits = 8; + Scale = 4; + } + break; + } + + if (!NewOpc) + continue; + + unsigned UserOffset = GetOffsetOf(U.MI) + 4; + unsigned MaxOffs = ((1 << Bits) - 1) * Scale; + // FIXME: Check if offset is multiple of scale if scale is not 4.
+ if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { + U.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = U.MI->getParent(); + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumT2CPShrunk; + MadeChange = true; + } + } + + MadeChange |= OptimizeThumb2Branches(MF); + MadeChange |= OptimizeThumb2JumpTables(MF); + return MadeChange; +} + +bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { + bool MadeChange = false; + + for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) { + ImmBranch &Br = ImmBranches[i]; + unsigned Opcode = Br.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2B: + NewOpc = ARM::tB; + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: + NewOpc = ARM::tBcc; + Bits = 8; + Scale = 2; + break; + } + if (!NewOpc) + continue; + + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBIsInRange(Br.MI, DestBB, MaxOffs)) { + Br.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = Br.MI->getParent(); + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumT2BrShrunk; + MadeChange = true; + } + } + + return MadeChange; +} + + +/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller +/// jumptables when it's possible. +bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { + bool MadeChange = false; + + // FIXME: After the tables are shrunk, can we get rid some of the + // constantpool tables? + const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { + MachineInstr *MI = T2JumpTables[i]; + const TargetInstrDesc &TID = MI->getDesc(); + unsigned NumOps = TID.getNumOperands(); + unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2); + MachineOperand JTOP = MI->getOperand(JTOpIdx); + unsigned JTI = JTOP.getIndex(); + assert(JTI < JT.size()); + + bool ByteOk = true; + bool HalfWordOk = true; + unsigned JTOffset = GetOffsetOf(MI) + 4; + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { + MachineBasicBlock *MBB = JTBBs[j]; + unsigned DstOffset = BBOffsets[MBB->getNumber()]; + // Negative offset is not ok. FIXME: We should change BB layout to make + // sure all the branches are forward. + if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2) + ByteOk = false; + unsigned TBHLimit = ((1<<16)-1)*2; + if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit) + HalfWordOk = false; + if (!ByteOk && !HalfWordOk) + break; + } + + if (ByteOk || HalfWordOk) { + MachineBasicBlock *MBB = MI->getParent(); + unsigned BaseReg = MI->getOperand(0).getReg(); + bool BaseRegKill = MI->getOperand(0).isKill(); + if (!BaseRegKill) + continue; + unsigned IdxReg = MI->getOperand(1).getReg(); + bool IdxRegKill = MI->getOperand(1).isKill(); + MachineBasicBlock::iterator PrevI = MI; + if (PrevI == MBB->begin()) + continue; + + MachineInstr *AddrMI = --PrevI; + bool OptOk = true; + // Examine the instruction that calculate the jumptable entry address. + // If it's not the one just before the t2BR_JT, we won't delete it, then + // it's not worth doing the optimization. 
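// Note: TBB and TBH branch to pc + 2 * entry, where the table holds byte
// or halfword entries respectively, giving forward ranges of
// ((1 << 8) - 1) * 2 = 510 and ((1 << 16) - 1) * 2 = 131070 bytes -- the
// ByteOk / HalfWordOk limits computed above. The operand scan below then
// checks that the address-forming instruction feeds nothing but the jump,
// so it can be deleted together with the t2BR_JT.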
+ for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) { + const MachineOperand &MO = AddrMI->getOperand(k); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isDef() && MO.getReg() != BaseReg) { + OptOk = false; + break; + } + if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) { + OptOk = false; + break; + } + } + if (!OptOk) + continue; + + // The previous instruction should be a tLEApcrel or t2LEApcrelJT, we want + // to delete it as well. + MachineInstr *LeaMI = --PrevI; + if ((LeaMI->getOpcode() != ARM::tLEApcrelJT && + LeaMI->getOpcode() != ARM::t2LEApcrelJT) || + LeaMI->getOperand(0).getReg() != BaseReg) + OptOk = false; + + if (!OptOk) + continue; + + unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH; + MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc)) + .addReg(IdxReg, getKillRegState(IdxRegKill)) + .addJumpTableIndex(JTI, JTOP.getTargetFlags()) + .addImm(MI->getOperand(JTOpIdx+1).getImm()); + // FIXME: Insert an "ALIGN" instruction to ensure the next instruction + // is 2-byte aligned. For now, asm printer will fix it up. + unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI); + unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI); + OrigSize += TII->GetInstSizeInBytes(LeaMI); + OrigSize += TII->GetInstSizeInBytes(MI); + + AddrMI->eraseFromParent(); + LeaMI->eraseFromParent(); + MI->eraseFromParent(); + + int delta = OrigSize - NewSize; + BBSizes[MBB->getNumber()] -= delta; + AdjustBBOffsetsAfter(MBB, -delta); + + ++NumTBs; + MadeChange = true; + } + } + + return MadeChange; +} diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index a75ed3b..7170089 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -15,33 +15,31 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/GlobalValue.h" #include "llvm/Type.h" -#include "llvm/Support/Streams.h" #include "llvm/Support/raw_ostream.h" +#include <cstdlib> using namespace llvm; ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id, - ARMCP::ARMCPKind k, + ARMCP::ARMCPKind K, unsigned char PCAdj, const char *Modif, bool AddCA) : MachineConstantPoolValue((const Type*)gv->getType()), - GV(gv), S(NULL), LabelId(id), Kind(k), PCAdjust(PCAdj), + GV(gv), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {} -ARMConstantPoolValue::ARMConstantPoolValue(const char *s, unsigned id, - ARMCP::ARMCPKind k, +ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, + const char *s, unsigned id, unsigned char PCAdj, const char *Modif, bool AddCA) - : MachineConstantPoolValue((const Type*)Type::Int32Ty), - GV(NULL), S(s), LabelId(id), Kind(k), PCAdjust(PCAdj), + : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)), + GV(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPValue), PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {} -ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, - ARMCP::ARMCPKind k, - const char *Modif) - : MachineConstantPoolValue((const Type*)Type::Int32Ty), - GV(gv), S(NULL), LabelId(0), Kind(k), PCAdjust(0), +ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, const char *Modif) + : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())), + GV(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0), Modifier(Modif) {} int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, @@ -56,7 +54,6 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, if 
(CPV->GV == GV && CPV->S == S && CPV->LabelId == LabelId && - CPV->Kind == Kind && CPV->PCAdjust == PCAdjust) return i; } @@ -65,31 +62,28 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, return -1; } +ARMConstantPoolValue::~ARMConstantPoolValue() { + free((void*)S); +} + void ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) { ID.AddPointer(GV); ID.AddPointer(S); ID.AddInteger(LabelId); - ID.AddInteger((unsigned)Kind); ID.AddInteger(PCAdjust); } void ARMConstantPoolValue::dump() const { - cerr << " " << *this; + errs() << " " << *this; } -void ARMConstantPoolValue::print(std::ostream &O) const { - raw_os_ostream RawOS(O); - print(RawOS); -} void ARMConstantPoolValue::print(raw_ostream &O) const { if (GV) O << GV->getName(); else O << S; - if (isNonLazyPointer()) O << "$non_lazy_ptr"; - else if (isStub()) O << "$stub"; if (Modifier) O << "(" << Modifier << ")"; if (PCAdjust != 0) { O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust; diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h index d2b9066..00c4808 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.h +++ b/lib/Target/ARM/ARMConstantPoolValue.h @@ -15,17 +15,16 @@ #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H #include "llvm/CodeGen/MachineConstantPool.h" -#include <iosfwd> namespace llvm { class GlobalValue; +class LLVMContext; namespace ARMCP { enum ARMCPKind { CPValue, - CPNonLazyPtr, - CPStub + CPLSDA }; } @@ -36,7 +35,7 @@ class ARMConstantPoolValue : public MachineConstantPoolValue { GlobalValue *GV; // GlobalValue being loaded. const char *S; // ExtSymbol being loaded. unsigned LabelId; // Label id of the load. - ARMCP::ARMCPKind Kind; // non_lazy_ptr or stub? + ARMCP::ARMCPKind Kind; // Value or LSDA? unsigned char PCAdjust; // Extra adjustment if constantpool is pc relative. // 8 for ARM, 4 for Thumb. const char *Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8)) @@ -47,12 +46,12 @@ public: ARMCP::ARMCPKind Kind = ARMCP::CPValue, unsigned char PCAdj = 0, const char *Modifier = NULL, bool AddCurrentAddress = false); - ARMConstantPoolValue(const char *s, unsigned id, - ARMCP::ARMCPKind Kind = ARMCP::CPValue, + ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id, unsigned char PCAdj = 0, const char *Modifier = NULL, bool AddCurrentAddress = false); - ARMConstantPoolValue(GlobalValue *GV, ARMCP::ARMCPKind Kind, - const char *Modifier); + ARMConstantPoolValue(GlobalValue *GV, const char *Modifier); + ARMConstantPoolValue(); + ~ARMConstantPoolValue(); GlobalValue *getGV() const { return GV; } @@ -61,27 +60,27 @@ public: bool hasModifier() const { return Modifier != NULL; } bool mustAddCurrentAddress() const { return AddCurrentAddress; } unsigned getLabelId() const { return LabelId; } - bool isNonLazyPointer() const { return Kind == ARMCP::CPNonLazyPtr; } - bool isStub() const { return Kind == ARMCP::CPStub; } unsigned char getPCAdjustment() const { return PCAdjust; } + bool isLSDA() { return Kind == ARMCP::CPLSDA; } + + virtual unsigned getRelocationInfo() const { + // FIXME: This is conservatively claiming that these entries require a + // relocation, we may be able to do better than this. 
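// Note: at this revision getRelocationInfo() is expected to return 0 for
// no relocations, 1 for local (DSO-internal) relocations and 2 for global
// relocations, so returning 2 is the maximally conservative answer that
// the FIXME above refers to.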
+ return 2; + } + virtual int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment); virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID); - void print(std::ostream *O) const { if (O) print(*O); } - void print(std::ostream &O) const; void print(raw_ostream *O) const { if (O) print(*O); } void print(raw_ostream &O) const; void dump() const; }; - inline std::ostream &operator<<(std::ostream &O, const ARMConstantPoolValue &V) { - V.print(O); - return O; -} - + inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) { V.print(O); return O; diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h index 405b8f2..d5dae24 100644 --- a/lib/Target/ARM/ARMFrameInfo.h +++ b/lib/Target/ARM/ARMFrameInfo.h @@ -15,15 +15,15 @@ #define ARM_FRAMEINFO_H #include "ARM.h" -#include "llvm/Target/TargetFrameInfo.h" #include "ARMSubtarget.h" +#include "llvm/Target/TargetFrameInfo.h" namespace llvm { class ARMFrameInfo : public TargetFrameInfo { public: explicit ARMFrameInfo(const ARMSubtarget &ST) - : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) { + : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) { } }; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 6485fc1..bebf4e8 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -21,6 +21,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -30,10 +31,10 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -using namespace llvm; +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" -static const unsigned arm_dsubreg_0 = 5; -static const unsigned arm_dsubreg_1 = 6; +using namespace llvm; //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine @@ -48,8 +49,9 @@ class ARMDAGToDAGISel : public SelectionDAGISel { const ARMSubtarget *Subtarget; public: - explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm) - : SelectionDAGISel(tm), TM(tm), + explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(&TM.getSubtarget<ARMSubtarget>()) { } @@ -57,7 +59,8 @@ public: return "ARM Instruction Selection"; } - /// getI32Imm - Return a target constant with the specified value, of type i32. + /// getI32Imm - Return a target constant of type i32 with the specified + /// value. inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); } @@ -74,6 +77,8 @@ public: SDValue &Offset, SDValue &Opc); bool SelectAddrMode3Offset(SDValue Op, SDValue N, SDValue &Offset, SDValue &Opc); + bool SelectAddrMode4(SDValue Op, SDValue N, SDValue &Addr, + SDValue &Mode); bool SelectAddrMode5(SDValue Op, SDValue N, SDValue &Base, SDValue &Offset); bool SelectAddrMode6(SDValue Op, SDValue N, SDValue &Addr, SDValue &Update, @@ -118,15 +123,63 @@ private: SDNode *SelectARMIndexedLoad(SDValue Op); SDNode *SelectT2IndexedLoad(SDValue Op); + /// SelectDYN_ALLOC - Select dynamic alloc for Thumb. + SDNode *SelectDYN_ALLOC(SDValue Op); + + /// SelectVLD - Select NEON load intrinsics. NumVecs should + /// be 2, 3 or 4. 
The opcode arrays specify the instructions used for + /// loads of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs == 2, QOpcodes1 is not used. + SDNode *SelectVLD(SDValue Op, unsigned NumVecs, unsigned *DOpcodes, + unsigned *QOpcodes0, unsigned *QOpcodes1); + + /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should + /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// load/store of D registers and even subregs and odd subregs of Q registers. + SDNode *SelectVLDSTLane(SDValue Op, bool IsLoad, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1); + + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. + SDNode *SelectV6T2BitfieldExtractOp(SDValue Op, unsigned Opc); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps); + + /// PairDRegs - Insert a pair of double registers into an implicit def to + /// form a quad register. + SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1); }; } +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so Imm will receive the 32-bit value. +static bool isInt32Immediate(SDNode *N, unsigned &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { + Imm = cast<ConstantSDNode>(N)->getZExtValue(); + return true; + } + return false; +} + +// isInt32Immediate - This method tests to see if a constant operand. +// If so Imm will receive the 32 bit value. +static bool isInt32Immediate(SDValue N, unsigned &Imm) { + return isInt32Immediate(N.getNode(), Imm); +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { + return N->getOpcode() == Opc && + isInt32Immediate(N->getOperand(1).getNode(), Imm); +} + + void ARMDAGToDAGISel::InstructionSelect() { DEBUG(BB->dump()); @@ -144,7 +197,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue Op, // Don't match base register only case. That is matched to a separate // lower complexity pattern with explicit register operand. if (ShOpcVal == ARM_AM::no_shift) return false; - + BaseReg = N.getOperand(0); unsigned ShImmVal = 0; if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { @@ -198,7 +251,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N, MVT::i32); return true; } - + // Match simple R +/- imm12 operands. if (N.getOpcode() == ISD::ADD) if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { @@ -223,15 +276,15 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N, return true; } } - + // Otherwise this is R +/- [possibly shifted] R ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub; ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1)); unsigned ShAmt = 0; - + Base = N.getOperand(0); Offset = N.getOperand(1); - + if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't fold // it. @@ -243,7 +296,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N, ShOpcVal = ARM_AM::no_shift; } } - + // Try matching (R shl C) + (R). 
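// Note: the check above looked for the shift on the right operand; the
// code below retries with the left operand as the shifted one so that
// (R shl C) + (R) also folds. The retry is gated on ISD::ADD because the
// operands of a subtraction cannot be exchanged.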
if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) { ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0)); @@ -260,7 +313,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N, } } } - + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), MVT::i32); return true; @@ -315,7 +368,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N, Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32); return true; } - + if (N.getOpcode() != ISD::ADD) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { @@ -326,7 +379,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N, Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32); return true; } - + // If the RHS is +/- imm8, fold into addr mode. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); @@ -348,7 +401,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N, return true; } } - + Base = N.getOperand(0); Offset = N.getOperand(1); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32); @@ -377,6 +430,12 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDValue Op, SDValue N, return true; } +bool ARMDAGToDAGISel::SelectAddrMode4(SDValue Op, SDValue N, + SDValue &Addr, SDValue &Mode) { + Addr = N; + Mode = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N, SDValue &Base, SDValue &Offset) { @@ -392,7 +451,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N, MVT::i32); return true; } - + // If the RHS is +/- imm8, fold into addr mode. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); @@ -417,7 +476,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N, } } } - + Base = N; Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), MVT::i32); @@ -428,14 +487,14 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDValue Op, SDValue N, SDValue &Addr, SDValue &Update, SDValue &Opc) { Addr = N; - // The optional writeback is handled in ARMLoadStoreOpt. + // Default to no writeback. Update = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(false), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectAddrModePC(SDValue Op, SDValue N, - SDValue &Offset, SDValue &Label) { + SDValue &Offset, SDValue &Label) { if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { Offset = N.getOperand(0); SDValue N1 = N.getOperand(1); @@ -451,13 +510,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue Op, SDValue N, // FIXME dl should come from the parent load or store, not the address DebugLoc dl = Op.getDebugLoc(); if (N.getOpcode() != ISD::ADD) { - Base = N; - // We must materialize a zero in a reg! Returning a constant here - // wouldn't work without additional code to position the node within - // ISel's topological ordering in a place where ISel will process it - // normally. Instead, just explicitly issue a tMOVri8 node! 
- Offset = SDValue(CurDAG->getTargetNode(ARM::tMOVi8, dl, MVT::i32, - CurDAG->getTargetConstant(0, MVT::i32)), 0); + ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); + if (!NC || NC->getZExtValue() != 0) + return false; + + Base = Offset = N; return true; } @@ -567,7 +624,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue Op, SDValue N, } } } - + return false; } @@ -594,41 +651,70 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue Op, SDValue N, bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue Op, SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. - if (N.getOpcode() != ISD::ADD) - return false; + + // Base only. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index... + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } else if (N.getOpcode() == ARMISD::Wrapper) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::TargetConstantPool) + return false; // We want to select t2LDRpci instead. + } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + if (SelectT2AddrModeImm8(Op, N, Base, OffImm)) + // Let t2LDRi8 handle (R - imm8). + return false; + int RHSC = (int)RHS->getZExtValue(); - if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits. + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; } } - return false; + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; } bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue Op, SDValue N, SDValue &Base, SDValue &OffImm) { - if (N.getOpcode() == ISD::ADD) { + // Match simple R - imm8 operands. + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if (RHSC < 0 && RHSC > -0x100) { // 8 bits. - Base = N.getOperand(0); + int RHSC = (int)RHS->getSExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; } } - } else if (N.getOpcode() == ISD::SUB) { - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if (RHSC >= 0 && RHSC < 0x100) { // 8 bits. - Base = N.getOperand(0); - OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32); - return true; - } - } } return false; @@ -643,7 +729,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDValue Op, SDValue N, if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) { int RHSC = (int)RHS->getZExtValue(); if (RHSC >= 0 && RHSC < 0x100) { // 8 bits. - OffImm = (AM == ISD::PRE_INC) + OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) ? 
CurDAG->getTargetConstant(RHSC, MVT::i32) : CurDAG->getTargetConstant(-RHSC, MVT::i32); return true; @@ -658,7 +744,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDValue Op, SDValue N, if (N.getOpcode() == ISD::ADD) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); - if (((RHSC & 0x3) == 0) && (RHSC < 0 && RHSC > -0x400)) { // 8 bits. + if (((RHSC & 0x3) == 0) && + ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits. Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -681,20 +768,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDValue Op, SDValue N, bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { - // Base only. - if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { - Base = N; - if (N.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } else if (N.getOpcode() == ARMISD::Wrapper) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::TargetConstantPool) - return false; // We want to select t2LDRpci instead. - } - OffReg = CurDAG->getRegister(0, MVT::i32); - ShImm = CurDAG->getTargetConstant(0, MVT::i32); - return true; + // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12. + if (N.getOpcode() != ISD::ADD) + return false; + + // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned) + return false; + else if (RHSC < 0 && RHSC >= -255) // 8 bits + return false; } // Look for (R + R) or (R + (R << [1,2,3])). @@ -708,8 +792,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N, ShOpcVal = ARM_AM::getShiftOpcForNode(Base); if (ShOpcVal == ARM_AM::lsl) std::swap(Base, OffReg); - } - + } + if (ShOpcVal == ARM_AM::lsl) { // Check to see if the RHS of the shift is a constant, if not, we can't fold // it. @@ -723,11 +807,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N, } else { ShOpcVal = ARM_AM::no_shift; } - } else if (SelectT2AddrModeImm12(Op, N, Base, ShImm) || - SelectT2AddrModeImm8 (Op, N, Base, ShImm)) - // Don't match if it's possible to match to one of the r +/- imm cases. 
- return false; - + } + ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32); return true; @@ -746,7 +827,7 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDValue Op) { if (AM == ISD::UNINDEXED) return NULL; - MVT LoadedVT = LD->getMemoryVT(); + EVT LoadedVT = LD->getMemoryVT(); SDValue Offset, AMOpc; bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); unsigned Opcode = 0; @@ -780,8 +861,8 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDValue Op) { SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; - return CurDAG->getTargetNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32, - MVT::Other, Ops, 6); + return CurDAG->getMachineNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32, + MVT::Other, Ops, 6); } return NULL; @@ -793,14 +874,14 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) { if (AM == ISD::UNINDEXED) return NULL; - MVT LoadedVT = LD->getMemoryVT(); + EVT LoadedVT = LD->getMemoryVT(); bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; SDValue Offset; bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); unsigned Opcode = 0; bool Match = false; if (SelectT2AddrModeImm8Offset(Op, LD->getOffset(), Offset)) { - switch (LoadedVT.getSimpleVT()) { + switch (LoadedVT.getSimpleVT().SimpleTy) { case MVT::i32: Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST; break; @@ -828,13 +909,300 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) { SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; - return CurDAG->getTargetNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32, - MVT::Other, Ops, 5); + return CurDAG->getMachineNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32, + MVT::Other, Ops, 5); + } + + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectDYN_ALLOC(SDValue Op) { + SDNode *N = Op.getNode(); + DebugLoc dl = N->getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); + SDValue SP = CurDAG->getRegister(ARM::SP, MVT::i32); + int32_t AlignVal = cast<ConstantSDNode>(Align)->getSExtValue(); + if (AlignVal < 0) + // We need to align the stack. Use Thumb1 tAND which is the only thumb + // instruction that can read and write SP. This matches to a pseudo + // instruction that has a chain to ensure the result is written back to + // the stack pointer. + SP = SDValue(CurDAG->getMachineNode(ARM::tANDsp, dl, VT, SP, Align), 0); + + bool isC = isa<ConstantSDNode>(Size); + uint32_t C = isC ? cast<ConstantSDNode>(Size)->getZExtValue() : ~0UL; + // Handle the most common case for both Thumb1 and Thumb2: + // tSUBspi - immediate is between 0 ... 508 inclusive. + if (C <= 508 && ((C & 3) == 0)) + // FIXME: tSUBspi encode scale 4 implicitly. + return CurDAG->SelectNodeTo(N, ARM::tSUBspi_, VT, MVT::Other, SP, + CurDAG->getTargetConstant(C/4, MVT::i32), + Chain); + + if (Subtarget->isThumb1Only()) { + // Use tADDspr since Thumb1 does not have a sub r, sp, r. ARMISelLowering + // should have negated the size operand already. FIXME: We can't insert + // new target independent node at this stage so we are forced to negate + // it earlier. Is there a better solution? 
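// Note: for a hypothetical dynamic alloca of 24 bytes, the lowering would
// hand this node Size = -24, and the tADDspr emitted below computes
// SP + (-24) = SP - 24 -- the addition stands in for the missing
// "sub r, sp, r" form.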
+ return CurDAG->SelectNodeTo(N, ARM::tADDspr_, VT, MVT::Other, SP, Size, + Chain); + } else if (Subtarget->isThumb2()) { + if (isC && Predicate_t2_so_imm(Size.getNode())) { + // t2SUBrSPi + SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain }; + return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi_, VT, MVT::Other, Ops, 3); + } else if (isC && Predicate_imm0_4095(Size.getNode())) { + // t2SUBrSPi12 + SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain }; + return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi12_, VT, MVT::Other, Ops, 3); + } else { + // t2SUBrSPs + SDValue Ops[] = { SP, Size, + getI32Imm(ARM_AM::getSORegOpc(ARM_AM::lsl,0)), Chain }; + return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPs_, VT, MVT::Other, Ops, 4); + } + } + + // FIXME: Add ADD / SUB sp instructions for ARM. + return 0; +} + +/// PairDRegs - Insert a pair of double registers into an implicit def to +/// form a quad register. +SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue Undef = + SDValue(CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT), 0); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::DSUBREG_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::DSUBREG_1, MVT::i32); + SDNode *Pair = CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl, + VT, Undef, V0, SubReg0); + return CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl, + VT, SDValue(Pair, 0), V1, SubReg1); +} + +/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type +/// for a 64-bit subregister of the vector. +static EVT GetNEONSubregVT(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled NEON type"); + case MVT::v16i8: return MVT::v8i8; + case MVT::v8i16: return MVT::v4i16; + case MVT::v4f32: return MVT::v2f32; + case MVT::v4i32: return MVT::v2i32; + case MVT::v2i64: return MVT::v1i64; + } +} + +SDNode *ARMDAGToDAGISel::SelectVLD(SDValue Op, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + SDNode *N = Op.getNode(); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + bool is64BitVector = VT.is64BitVector(); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + } + + if (is64BitVector) { + unsigned Opc = DOpcodes[OpcodeIndex]; + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain }; + std::vector<EVT> ResTys(NumVecs, VT); + ResTys.push_back(MVT::Other); + return CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 4); + } + + EVT RegVT = GetNEONSubregVT(VT); + if (NumVecs == 2) { + // Quad registers are directly supported for VLD2, + // loading 2 pairs of D regs. 
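// Note: each Q-register result below is assembled by PairDRegs above: an
// IMPLICIT_DEF of the quad type receives the even D half at DSUBREG_0 and
// the odd half at DSUBREG_1 through two INSERT_SUBREGs, and the users of
// the intrinsic are rewired to the paired value.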
+ unsigned Opc = QOpcodes0[OpcodeIndex]; + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain }; + std::vector<EVT> ResTys(4, VT); + ResTys.push_back(MVT::Other); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 4); + Chain = SDValue(VLd, 4); + + // Combine the even and odd subregs to produce the result. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1)); + ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); + } + } else { + // Otherwise, quad registers are loaded with two separate instructions, + // where one loads the even registers and the other loads the odd registers. + + // Enable writeback to the address register. + MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32); + + std::vector<EVT> ResTys(NumVecs, RegVT); + ResTys.push_back(MemAddr.getValueType()); + ResTys.push_back(MVT::Other); + + // Load the even subreg. + unsigned Opc = QOpcodes0[OpcodeIndex]; + const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, Chain }; + SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 4); + Chain = SDValue(VLdA, NumVecs+1); + + // Load the odd subreg. + Opc = QOpcodes1[OpcodeIndex]; + const SDValue OpsB[] = { SDValue(VLdA, NumVecs), MemUpdate, MemOpc, Chain }; + SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 4); + Chain = SDValue(VLdB, NumVecs+1); + + // Combine the even and odd subregs to produce the result. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec)); + ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); + } + } + ReplaceUses(SDValue(N, NumVecs), Chain); + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDValue Op, bool IsLoad, + unsigned NumVecs, unsigned *DOpcodes, + unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + SDNode *N = Op.getNode(); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue(); + EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType(); + bool is64BitVector = VT.is64BitVector(); + + // Quad registers are handled by load/store of subregs. Find the subreg info. + unsigned NumElts = 0; + int SubregIdx = 0; + EVT RegVT = VT; + if (!is64BitVector) { + RegVT = GetNEONSubregVT(VT); + NumElts = RegVT.getVectorNumElements(); + SubregIdx = (Lane < NumElts) ? ARM::DSUBREG_0 : ARM::DSUBREG_1; + } + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld/vst lane type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + // Quad-register operations: + case MVT::v8i16: OpcodeIndex = 0; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 1; break; + } + + SmallVector<SDValue, 9> Ops; + Ops.push_back(MemAddr); + Ops.push_back(MemUpdate); + Ops.push_back(MemOpc); + + unsigned Opc = 0; + if (is64BitVector) { + Opc = DOpcodes[OpcodeIndex]; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(N->getOperand(Vec+3)); + } else { + // Check if this is loading the even or odd subreg of a Q register. 
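// Note: NumElts counts the elements of one D half, so a lane below that
// bound lives in DSUBREG_0 and uses the "even" opcode table; otherwise
// the lane index is rebased into the odd half. For example, with v8i16
// NumElts is 4, so lane 6 becomes lane 2 of DSUBREG_1.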
+ if (Lane < NumElts) { + Opc = QOpcodes0[OpcodeIndex]; + } else { + Lane -= NumElts; + Opc = QOpcodes1[OpcodeIndex]; + } + // Extract the subregs of the input vector. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT, + N->getOperand(Vec+3))); + } + Ops.push_back(getI32Imm(Lane)); + Ops.push_back(Chain); + + if (!IsLoad) + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5); + + std::vector<EVT> ResTys(NumVecs, RegVT); + ResTys.push_back(MVT::Other); + SDNode *VLdLn = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), NumVecs+5); + // For a 64-bit vector load to D registers, nothing more needs to be done. + if (is64BitVector) + return VLdLn; + + // For 128-bit vectors, take the 64-bit results of the load and insert them + // as subregs into the result. + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT, + N->getOperand(Vec+3), + SDValue(VLdLn, Vec)); + ReplaceUses(SDValue(N, Vec), QuadVec); } + Chain = SDValue(VLdLn, NumVecs); + ReplaceUses(SDValue(N, NumVecs), Chain); return NULL; } +SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDValue Op, + unsigned Opc) { + if (!Subtarget->hasV6T2Ops()) + return NULL; + + unsigned Shl_imm = 0; + if (isOpcWithIntImmediate(Op.getOperand(0).getNode(), ISD::SHL, Shl_imm)){ + assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!"); + unsigned Srl_imm = 0; + if (isInt32Immediate(Op.getOperand(1), Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + unsigned Width = 32 - Srl_imm; + int LSB = Srl_imm - Shl_imm; + if ((LSB + Width) > 32) + return NULL; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Op.getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, MVT::i32), + CurDAG->getTargetConstant(Width, MVT::i32), + getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(Op.getNode(), Opc, MVT::i32, Ops, 5); + } + } + return NULL; +} SDNode *ARMDAGToDAGISel::Select(SDValue Op) { SDNode *N = Op.getNode(); @@ -848,44 +1216,50 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); bool UseCP = true; - if (Subtarget->isThumb()) { - if (Subtarget->hasThumb2()) - // Thumb2 has the MOVT instruction, so all immediates can - // be done with MOV + MOVT, at worst. - UseCP = 0; - else + if (Subtarget->hasThumb2()) + // Thumb2-aware targets have the MOVT instruction, so all immediates can + // be done with MOV + MOVT, at worst. + UseCP = 0; + else { + if (Subtarget->isThumb()) { UseCP = (Val > 255 && // MOV ~Val > 255 && // MOV + MVN !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL - } else - UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV - ARM_AM::getSOImmVal(~Val) == -1 && // MVN - !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs. + } else + UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV + ARM_AM::getSOImmVal(~Val) == -1 && // MVN + !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs. 
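// Note: an ARM so_imm is an 8-bit value rotated right by an even amount,
// so the tests above try MOV of Val, MVN of ~Val, and a two-instruction
// split (typically MOV followed by ORR) before giving up; only then is
// the constant placed in the constant pool and loaded back with LDRcp.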
+ } + if (UseCP) { SDValue CPIdx = - CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val), + CurDAG->getTargetConstantPool(ConstantInt::get( + Type::getInt32Ty(*CurDAG->getContext()), Val), TLI.getPointerTy()); SDNode *ResNode; - if (Subtarget->isThumb()) - ResNode = CurDAG->getTargetNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other, - CPIdx, CurDAG->getEntryNode()); - else { + if (Subtarget->isThumb1Only()) { + SDValue Pred = CurDAG->getTargetConstant(0xEULL, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other, + Ops, 4); + } else { SDValue Ops[] = { - CPIdx, + CPIdx, CurDAG->getRegister(0, MVT::i32), CurDAG->getTargetConstant(0, MVT::i32), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getEntryNode() }; - ResNode=CurDAG->getTargetNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, - Ops, 6); + ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, + Ops, 6); } ReplaceUses(Op, SDValue(ResNode, 0)); return NULL; } - + // Other cases are autogenerated. break; } @@ -893,80 +1267,106 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. int FI = cast<FrameIndexSDNode>(N)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - if (Subtarget->isThumb()) { + if (Subtarget->isThumb1Only()) { return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI, CurDAG->getTargetConstant(0, MVT::i32)); } else { + unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? + ARM::t2ADDri : ARM::ADDri); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), - getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), - CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->SelectNodeTo(N, ARM::ADDri, MVT::i32, Ops, 5); + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); } } - case ISD::ADD: { - if (!Subtarget->isThumb()) - break; - // Select add sp, c to tADDhirr. - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1)); - if (LHSR && LHSR->getReg() == ARM::SP) { - std::swap(N0, N1); - std::swap(LHSR, RHSR); - } - if (RHSR && RHSR->getReg() == ARM::SP) { - SDValue Val = SDValue(CurDAG->getTargetNode(ARM::tMOVlor2hir, dl, - Op.getValueType(), N0, N0), 0); - return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), Val, N1); - } + case ARMISD::DYN_ALLOC: + return SelectDYN_ALLOC(Op); + case ISD::SRL: + if (SDNode *I = SelectV6T2BitfieldExtractOp(Op, + Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX)) + return I; + break; + case ISD::SRA: + if (SDNode *I = SelectV6T2BitfieldExtractOp(Op, + Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)) + return I; break; - } case ISD::MUL: - if (Subtarget->isThumb()) + if (Subtarget->isThumb1Only()) break; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { unsigned RHSV = C->getZExtValue(); if (!RHSV) break; if (isPowerOf2_32(RHSV-1)) { // 2^n+1? 
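// Note: a multiply by 2^n+1 becomes "add Rd, Rm, Rm, lsl #n" below, e.g.
// x * 9 == x + (x << 3); the 2^n-1 case further down uses a reverse
// subtract instead, e.g. x * 7 == (x << 3) - x.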
+ unsigned ShImm = Log2_32(RHSV-1); + if (ShImm >= 32) + break; SDValue V = Op.getOperand(0); - unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV-1)); - SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32), - CurDAG->getTargetConstant(ShImm, MVT::i32), - getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), - CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7); + ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm); + SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + if (Subtarget->isThumb()) { + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6); + } else { + SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7); + } } if (isPowerOf2_32(RHSV+1)) { // 2^n-1? + unsigned ShImm = Log2_32(RHSV+1); + if (ShImm >= 32) + break; SDValue V = Op.getOperand(0); - unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV+1)); - SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32), - CurDAG->getTargetConstant(ShImm, MVT::i32), - getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), - CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7); + ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm); + SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + if (Subtarget->isThumb()) { + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5); + } else { + SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7); + } } } break; case ARMISD::FMRRD: - return CurDAG->getTargetNode(ARM::FMRRD, dl, MVT::i32, MVT::i32, - Op.getOperand(0), getAL(CurDAG), - CurDAG->getRegister(0, MVT::i32)); + return CurDAG->getMachineNode(ARM::FMRRD, dl, MVT::i32, MVT::i32, + Op.getOperand(0), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)); case ISD::UMUL_LOHI: { - SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4); + } else { + SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getTargetNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5); + return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5); + } } case ISD::SMUL_LOHI: { - SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4); + } else { + SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getTargetNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5); + return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5); + } } case ISD::LOAD: { SDNode *ResNode = 0; - if (Subtarget->isThumb2()) + if 
(Subtarget->isThumb() && Subtarget->hasThumb2()) ResNode = SelectT2IndexedLoad(Op); else ResNode = SelectARMIndexedLoad(Op); @@ -988,7 +1388,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc) // Pattern complexity = 6 cost = 1 size = 0 - unsigned Opc = Subtarget->isThumb() ? + unsigned Opc = Subtarget->isThumb() ? ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; SDValue Chain = Op.getOperand(0); SDValue N1 = Op.getOperand(1); @@ -1003,8 +1403,8 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { cast<ConstantSDNode>(N2)->getZExtValue()), MVT::i32); SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; - SDNode *ResNode = CurDAG->getTargetNode(Opc, dl, MVT::Other, - MVT::Flag, Ops, 5); + SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, + MVT::Flag, Ops, 5); Chain = SDValue(ResNode, 0); if (Op.getNode()->getNumValues() == 2) { InFlag = SDValue(ResNode, 1); @@ -1014,8 +1414,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { return NULL; } case ARMISD::CMOV: { - bool isThumb = Subtarget->isThumb(); - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); @@ -1024,39 +1423,79 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { assert(N2.getOpcode() == ISD::Constant); assert(N3.getOpcode() == ISD::Register); - // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) - // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) - // Pattern complexity = 18 cost = 1 size = 0 - SDValue CPTmp0; - SDValue CPTmp1; - SDValue CPTmp2; - if (!isThumb && VT == MVT::i32 && - SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) { - SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) - cast<ConstantSDNode>(N2)->getZExtValue()), - MVT::i32); - SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag }; - return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCs, MVT::i32, Ops, 7); - } + if (!Subtarget->isThumb1Only() && VT == MVT::i32) { + // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Pattern complexity = 18 cost = 1 size = 0 + SDValue CPTmp0; + SDValue CPTmp1; + SDValue CPTmp2; + if (Subtarget->isThumb()) { + if (SelectT2ShifterOperandReg(Op, N1, CPTmp0, CPTmp1)) { + unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue(); + unsigned SOShOp = ARM_AM::getSORegShOp(SOVal); + unsigned Opc = 0; + switch (SOShOp) { + case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break; + case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break; + case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break; + case ARM_AM::ror: Opc = ARM::t2MOVCCror; break; + default: + llvm_unreachable("Unknown so_reg opcode!"); + break; + } + SDValue SOShImm = + CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32); + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N0, CPTmp0, SOShImm, Tmp2, N3, InFlag }; + return CurDAG->SelectNodeTo(Op.getNode(), Opc, MVT::i32,Ops, 6); + } + } else { + if (SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) { + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag }; + return CurDAG->SelectNodeTo(Op.getNode(), + ARM::MOVCCs, MVT::i32, Ops, 7); + } + } - // Pattern: (ARMcmov:i32 GPR:i32:$false, - // 
(imm:i32)<<P:Predicate_so_imm>><<X:so_imm_XFORM>>:$true, - // (imm:i32):$cc) - // Emits: (MOVCCi:i32 GPR:i32:$false, - // (so_imm_XFORM:i32 (imm:i32):$true), (imm:i32):$cc) - // Pattern complexity = 10 cost = 1 size = 0 - if (VT == MVT::i32 && - N3.getOpcode() == ISD::Constant && - Predicate_so_imm(N3.getNode())) { - SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned) - cast<ConstantSDNode>(N1)->getZExtValue()), - MVT::i32); - Tmp1 = Transform_so_imm_XFORM(Tmp1.getNode()); - SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) - cast<ConstantSDNode>(N2)->getZExtValue()), - MVT::i32); - SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag }; - return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCi, MVT::i32, Ops, 5); + // Pattern: (ARMcmov:i32 GPR:i32:$false, + // (imm:i32)<<P:Predicate_so_imm>>:$true, + // (imm:i32):$cc) + // Emits: (MOVCCi:i32 GPR:i32:$false, + // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc) + // Pattern complexity = 10 cost = 1 size = 0 + if (N3.getOpcode() == ISD::Constant) { + if (Subtarget->isThumb()) { + if (Predicate_t2_so_imm(N3.getNode())) { + SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N1)->getZExtValue()), + MVT::i32); + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag }; + return CurDAG->SelectNodeTo(Op.getNode(), + ARM::t2MOVCCi, MVT::i32, Ops, 5); + } + } else { + if (Predicate_so_imm(N3.getNode())) { + SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N1)->getZExtValue()), + MVT::i32); + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag }; + return CurDAG->SelectNodeTo(Op.getNode(), + ARM::MOVCCi, MVT::i32, Ops, 5); + } + } + } } // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) @@ -1073,23 +1512,25 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { MVT::i32); SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag }; unsigned Opc = 0; - switch (VT.getSimpleVT()) { + switch (VT.getSimpleVT().SimpleTy) { default: assert(false && "Illegal conditional move type!"); break; case MVT::i32: - Opc = isThumb ? ARM::tMOVCCr : ARM::MOVCCr; + Opc = Subtarget->isThumb() + ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo) + : ARM::MOVCCr; break; case MVT::f32: Opc = ARM::FCPYScc; break; case MVT::f64: Opc = ARM::FCPYDcc; - break; + break; } return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5); } case ARMISD::CNEG: { - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); @@ -1103,7 +1544,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { MVT::i32); SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag }; unsigned Opc = 0; - switch (VT.getSimpleVT()) { + switch (VT.getSimpleVT().SimpleTy) { default: assert(false && "Illegal conditional move type!"); break; case MVT::f32: @@ -1116,104 +1557,308 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5); } - case ISD::DECLARE: { - SDValue Chain = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - SDValue N2 = Op.getOperand(2); - FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1); - // FIXME: handle VLAs. 
- if (!FINode) { - ReplaceUses(Op.getValue(0), Chain); - return NULL; + case ARMISD::VZIP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VZIPd8; break; + case MVT::v4i16: Opc = ARM::VZIPd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VZIPd32; break; + case MVT::v16i8: Opc = ARM::VZIPq8; break; + case MVT::v8i16: Opc = ARM::VZIPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VZIPq32; break; } - if (N2.getOpcode() == ARMISD::PIC_ADD && isa<LoadSDNode>(N2.getOperand(0))) - N2 = N2.getOperand(0); - LoadSDNode *Ld = dyn_cast<LoadSDNode>(N2); - if (!Ld) { - ReplaceUses(Op.getValue(0), Chain); - return NULL; + return CurDAG->getMachineNode(Opc, dl, VT, VT, + N->getOperand(0), N->getOperand(1)); + } + case ARMISD::VUZP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VUZPd8; break; + case MVT::v4i16: Opc = ARM::VUZPd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VUZPd32; break; + case MVT::v16i8: Opc = ARM::VUZPq8; break; + case MVT::v8i16: Opc = ARM::VUZPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VUZPq32; break; } - SDValue BasePtr = Ld->getBasePtr(); - assert(BasePtr.getOpcode() == ARMISD::Wrapper && - isa<ConstantPoolSDNode>(BasePtr.getOperand(0)) && - "llvm.dbg.variable should be a constantpool node"); - ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(BasePtr.getOperand(0)); - GlobalValue *GV = 0; - if (CP->isMachineConstantPoolEntry()) { - ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)CP->getMachineCPVal(); - GV = ACPV->getGV(); - } else - GV = dyn_cast<GlobalValue>(CP->getConstVal()); - if (!GV) { - ReplaceUses(Op.getValue(0), Chain); - return NULL; + return CurDAG->getMachineNode(Opc, dl, VT, VT, + N->getOperand(0), N->getOperand(1)); + } + case ARMISD::VTRN: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VTRNd8; break; + case MVT::v4i16: Opc = ARM::VTRNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VTRNd32; break; + case MVT::v16i8: Opc = ARM::VTRNq8; break; + case MVT::v8i16: Opc = ARM::VTRNq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VTRNq32; break; } - - SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(), - TLI.getPointerTy()); - SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GV, TLI.getPointerTy()); - SDValue Ops[] = { Tmp1, Tmp2, Chain }; - return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl, - MVT::Other, Ops, 3); + return CurDAG->getMachineNode(Opc, dl, VT, VT, + N->getOperand(0), N->getOperand(1)); } - case ISD::CONCAT_VECTORS: { - MVT VT = Op.getValueType(); - assert(VT.is128BitVector() && Op.getNumOperands() == 2 && - "unexpected CONCAT_VECTORS"); - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - SDNode *Result = - CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT); - if (N0.getOpcode() != ISD::UNDEF) - Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT, - SDValue(Result, 0), N0, - CurDAG->getTargetConstant(arm_dsubreg_0, - MVT::i32)); - if (N1.getOpcode() != ISD::UNDEF) - Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT, - SDValue(Result, 0), N1, - CurDAG->getTargetConstant(arm_dsubreg_1, - MVT::i32)); - return Result; - } + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = 
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + EVT VT = N->getValueType(0); + unsigned Opc = 0; + + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vld2: { + unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, + ARM::VLD2d32, ARM::VLD2d64 }; + unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; + return SelectVLD(Op, 2, DOpcodes, QOpcodes, 0); + } - case ISD::VECTOR_SHUFFLE: { - MVT VT = Op.getValueType(); - - // Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in - // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be - // transformed first into a lane number and then to both a subregister - // index and an adjusted lane number.) If the source operand is a - // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP. - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - if (VT.is128BitVector() && SVOp->isSplat() && - Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR && - Op.getOperand(1).getOpcode() == ISD::UNDEF) { - unsigned LaneVal = SVOp->getSplatIndex(); - - MVT HalfVT; - unsigned Opc = 0; - switch (VT.getVectorElementType().getSimpleVT()) { - default: assert(false && "unhandled VDUP splat type"); - case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break; - case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break; - case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break; - case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break; + case Intrinsic::arm_neon_vld3: { + unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, + ARM::VLD3d32, ARM::VLD3d64 }; + unsigned QOpcodes0[] = { ARM::VLD3q8a, ARM::VLD3q16a, ARM::VLD3q32a }; + unsigned QOpcodes1[] = { ARM::VLD3q8b, ARM::VLD3q16b, ARM::VLD3q32b }; + return SelectVLD(Op, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld4: { + unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, + ARM::VLD4d32, ARM::VLD4d64 }; + unsigned QOpcodes0[] = { ARM::VLD4q8a, ARM::VLD4q16a, ARM::VLD4q32a }; + unsigned QOpcodes1[] = { ARM::VLD4q8b, ARM::VLD4q16b, ARM::VLD4q32b }; + return SelectVLD(Op, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld2lane: { + unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 }; + unsigned QOpcodes0[] = { ARM::VLD2LNq16a, ARM::VLD2LNq32a }; + unsigned QOpcodes1[] = { ARM::VLD2LNq16b, ARM::VLD2LNq32b }; + return SelectVLDSTLane(Op, true, 2, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld3lane: { + unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 }; + unsigned QOpcodes0[] = { ARM::VLD3LNq16a, ARM::VLD3LNq32a }; + unsigned QOpcodes1[] = { ARM::VLD3LNq16b, ARM::VLD3LNq32b }; + return SelectVLDSTLane(Op, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld4lane: { + unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 }; + unsigned QOpcodes0[] = { ARM::VLD4LNq16a, ARM::VLD4LNq32a }; + unsigned QOpcodes1[] = { ARM::VLD4LNq16b, ARM::VLD4LNq32b }; + return SelectVLDSTLane(Op, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst2: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + SDValue Chain = N->getOperand(0); + VT = N->getOperand(3).getValueType(); + if (VT.is64BitVector()) { + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst2 type"); + case MVT::v8i8: Opc = ARM::VST2d8; break; + case MVT::v4i16: Opc = ARM::VST2d16; break; + case 
MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST2d32; break; + case MVT::v1i64: Opc = ARM::VST2d64; break; + } + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(3), N->getOperand(4), Chain }; + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6); + } + // Quad registers are stored as pairs of double registers. + EVT RegVT; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst2 type"); + case MVT::v16i8: Opc = ARM::VST2q8; RegVT = MVT::v8i8; break; + case MVT::v8i16: Opc = ARM::VST2q16; RegVT = MVT::v4i16; break; + case MVT::v4f32: Opc = ARM::VST2q32; RegVT = MVT::v2f32; break; + case MVT::v4i32: Opc = ARM::VST2q32; RegVT = MVT::v2i32; break; + } + SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(3)); + SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(3)); + SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(4)); + SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(4)); + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + D0, D1, D2, D3, Chain }; + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 8); + } + + case Intrinsic::arm_neon_vst3: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + SDValue Chain = N->getOperand(0); + VT = N->getOperand(3).getValueType(); + if (VT.is64BitVector()) { + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst3 type"); + case MVT::v8i8: Opc = ARM::VST3d8; break; + case MVT::v4i16: Opc = ARM::VST3d16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST3d32; break; + case MVT::v1i64: Opc = ARM::VST3d64; break; + } + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(3), N->getOperand(4), + N->getOperand(5), Chain }; + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7); } + // Quad registers are stored with two separate instructions, where one + // stores the even registers and the other stores the odd registers. + EVT RegVT; + unsigned Opc2 = 0; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst3 type"); + case MVT::v16i8: + Opc = ARM::VST3q8a; Opc2 = ARM::VST3q8b; RegVT = MVT::v8i8; break; + case MVT::v8i16: + Opc = ARM::VST3q16a; Opc2 = ARM::VST3q16b; RegVT = MVT::v4i16; break; + case MVT::v4f32: + Opc = ARM::VST3q32a; Opc2 = ARM::VST3q32b; RegVT = MVT::v2f32; break; + case MVT::v4i32: + Opc = ARM::VST3q32a; Opc2 = ARM::VST3q32b; RegVT = MVT::v2i32; break; + } + // Enable writeback to the address register. 
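A note on the lane semantics behind the VZIP/VUZP/VTRN selections earlier in this hunk. Each of these NEON permutes writes both of its inputs, which is why the nodes above are created with two results of type VT. A minimal sketch on two 4-lane inputs a and b (an illustration of the instruction semantics, not LLVM code):

    // VZIP: a' = {a0,b0,a1,b1}, b' = {a2,b2,a3,b3}   // interleave
    // VUZP: a' = {a0,a2,b0,b2}, b' = {a1,a3,b1,b3}   // de-interleave
    // VTRN: a' = {a0,b0,a2,b2}, b' = {a1,b1,a3,b3}   // 2x2 transpose
    void vzip4(int a[4], int b[4]) {
      int lo[4] = { a[0], b[0], a[1], b[1] };
      int hi[4] = { a[2], b[2], a[3], b[3] };
      for (int i = 0; i != 4; ++i) { a[i] = lo[i]; b[i] = hi[i]; }
    }

The vst2 path above, in turn, relies on each q register aliasing a pair of d registers (q0 = {d0,d1}), so DSUBREG_0/DSUBREG_1 recover the two storable halves. For the vst3/vst4 quad cases, the writeback enabled next is what lets the odd-half store resume where the even-half store stopped; a hypothetical expansion for a 128-bit vst3 of q0-q2, assuming standard NEON assembly syntax:

    // vst3.32 {d0, d2, d4}, [r0]!   // even halves; '!' writes back r0
    // vst3.32 {d1, d3, d5}, [r0]!   // odd halves at the updated address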
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32); + + SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(3)); + SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(4)); + SDValue D4 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(5)); + const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, D0, D2, D4, Chain }; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, OpsA, 7); + Chain = SDValue(VStA, 1); + + SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(3)); + SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(4)); + SDValue D5 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(5)); + MemAddr = SDValue(VStA, 0); + const SDValue OpsB[] = { MemAddr, MemUpdate, MemOpc, D1, D3, D5, Chain }; + SDNode *VStB = CurDAG->getMachineNode(Opc2, dl, MemAddr.getValueType(), + MVT::Other, OpsB, 7); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; + } - // The source operand needs to be changed to a subreg of the original - // 128-bit operand, and the lane number needs to be adjusted accordingly. - unsigned NumElts = VT.getVectorNumElements() / 2; - unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1); - SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32); - SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32); - SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG, - dl, HalfVT, N->getOperand(0), SR); - return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane); + case Intrinsic::arm_neon_vst4: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + SDValue Chain = N->getOperand(0); + VT = N->getOperand(3).getValueType(); + if (VT.is64BitVector()) { + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst4 type"); + case MVT::v8i8: Opc = ARM::VST4d8; break; + case MVT::v4i16: Opc = ARM::VST4d16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST4d32; break; + case MVT::v1i64: Opc = ARM::VST4d64; break; + } + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(3), N->getOperand(4), + N->getOperand(5), N->getOperand(6), Chain }; + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 8); + } + // Quad registers are stored with two separate instructions, where one + // stores the even registers and the other stores the odd registers. + EVT RegVT; + unsigned Opc2 = 0; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst4 type"); + case MVT::v16i8: + Opc = ARM::VST4q8a; Opc2 = ARM::VST4q8b; RegVT = MVT::v8i8; break; + case MVT::v8i16: + Opc = ARM::VST4q16a; Opc2 = ARM::VST4q16b; RegVT = MVT::v4i16; break; + case MVT::v4f32: + Opc = ARM::VST4q32a; Opc2 = ARM::VST4q32b; RegVT = MVT::v2f32; break; + case MVT::v4i32: + Opc = ARM::VST4q32a; Opc2 = ARM::VST4q32b; RegVT = MVT::v2i32; break; + } + // Enable writeback to the address register. 
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32); + + SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(3)); + SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(4)); + SDValue D4 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(5)); + SDValue D6 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, + N->getOperand(6)); + const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, + D0, D2, D4, D6, Chain }; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, OpsA, 8); + Chain = SDValue(VStA, 1); + + SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(3)); + SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(4)); + SDValue D5 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(5)); + SDValue D7 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT, + N->getOperand(6)); + MemAddr = SDValue(VStA, 0); + const SDValue OpsB[] = { MemAddr, MemUpdate, MemOpc, + D1, D3, D5, D7, Chain }; + SDNode *VStB = CurDAG->getMachineNode(Opc2, dl, MemAddr.getValueType(), + MVT::Other, OpsB, 8); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; } - break; + case Intrinsic::arm_neon_vst2lane: { + unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 }; + unsigned QOpcodes0[] = { ARM::VST2LNq16a, ARM::VST2LNq32a }; + unsigned QOpcodes1[] = { ARM::VST2LNq16b, ARM::VST2LNq32b }; + return SelectVLDSTLane(Op, false, 2, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst3lane: { + unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 }; + unsigned QOpcodes0[] = { ARM::VST3LNq16a, ARM::VST3LNq32a }; + unsigned QOpcodes1[] = { ARM::VST3LNq16b, ARM::VST3LNq32b }; + return SelectVLDSTLane(Op, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst4lane: { + unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 }; + unsigned QOpcodes0[] = { ARM::VST4LNq16a, ARM::VST4LNq32a }; + unsigned QOpcodes1[] = { ARM::VST4LNq16b, ARM::VST4LNq32b }; + return SelectVLDSTLane(Op, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + } } } @@ -1224,20 +1869,17 @@ bool ARMDAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) { assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - - SDValue Base, Offset, Opc; - if (!SelectAddrMode2(Op, Op, Base, Offset, Opc)) - return true; - - OutOps.push_back(Base); - OutOps.push_back(Offset); - OutOps.push_back(Opc); + // Require the address to be in a register. That is safe for all ARM + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); return false; } /// createARMISelDag - This pass converts a legalized DAG into a /// ARM-specific DAG, ready for instruction scheduling. 
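For users of the "m" constraint, the SelectInlineAsmMemoryOperand change above means an ARM inline-asm memory operand is now handed over as a bare pointer register; offset and auto-increment addressing forms are no longer offered. A hypothetical use (my example, not from this patch, and assuming the printer renders the operand as a plain [rN] address):

    int load_once(volatile int *p) {
      int v;
      // The address is forced into a register before the asm runs.
      asm("ldr %0, %1" : "=r"(v) : "m"(*p));
      return v;
    }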
/// -FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM) { - return new ARMDAGToDAGISel(TM); +FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new ARMDAGToDAGISel(TM, OptLevel); } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 41c9ecc..426cecb 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -17,9 +17,11 @@ #include "ARMConstantPoolValue.h" #include "ARMISelLowering.h" #include "ARMMachineFunctionInfo.h" +#include "ARMPerfectShuffle.h" #include "ARMRegisterInfo.h" #include "ARMSubtarget.h" #include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" #include "llvm/CallingConv.h" #include "llvm/Constants.h" #include "llvm/Function.h" @@ -36,74 +38,101 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include <sstream> using namespace llvm; -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, - MVT PromotedBitwiseVT) { +void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, + EVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { - setOperationAction(ISD::LOAD, VT, Promote); - AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::LOAD, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); - setOperationAction(ISD::STORE, VT, Promote); - AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::STORE, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); } - MVT ElemTy = VT.getVectorElementType(); + EVT ElemTy = VT.getVectorElementType(); if (ElemTy != MVT::i64 && ElemTy != MVT::f64) - setOperationAction(ISD::VSETCC, VT, Custom); + setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom); if (ElemTy == MVT::i8 || ElemTy == MVT::i16) - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + if (ElemTy != MVT::i32) { + setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand); + 
setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand); + } + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); if (VT.isInteger()) { - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); } // Promote all bit-wise operations. if (VT.isInteger() && VT != PromotedBitwiseVT) { - setOperationAction(ISD::AND, VT, Promote); - AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); - setOperationAction(ISD::OR, VT, Promote); - AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); - setOperationAction(ISD::XOR, VT, Promote); - AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); - } -} - -void ARMTargetLowering::addDRTypeForNEON(MVT VT) { + setOperationAction(ISD::AND, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::AND, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + setOperationAction(ISD::OR, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::OR, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::XOR, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + } + + // Neon does not support vector divide/remainder operations. + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); +} + +void ARMTargetLowering::addDRTypeForNEON(EVT VT) { addRegisterClass(VT, ARM::DPRRegisterClass); addTypeForNEON(VT, MVT::f64, MVT::v2i32); } -void ARMTargetLowering::addQRTypeForNEON(MVT VT) { +void ARMTargetLowering::addQRTypeForNEON(EVT VT) { addRegisterClass(VT, ARM::QPRRegisterClass); addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } +static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { + if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin()) + return new TargetLoweringObjectFileMachO(); + return new ARMElfTargetObjectFile(); +} + ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) - : TargetLowering(TM), ARMPCLabelIndex(0) { + : TargetLowering(TM, createTLOF(TM)), ARMPCLabelIndex(0) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); if (Subtarget->isTargetDarwin()) { @@ -188,11 +217,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallName(RTLIB::SRL_I128, 0); setLibcallName(RTLIB::SRA_I128, 0); - if (Subtarget->isThumb()) + // Libcalls should use the AAPCS base standard ABI, even if hard float + // is in effect, as per the ARM RTABI specification, section 4.1.2. 
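To make that RTABI point concrete: the __aeabi_* runtime helpers are specified against the base standard, so even on a hard-float AAPCS target where ordinary calls pass floating point in VFP registers, libcall arguments travel in core registers. A sketch, assuming a target where this addition is not legal and becomes a runtime call:

    double add(double a, double b) {
      return a + b;   // emitted as a call to __aeabi_dadd: a in {r0,r1},
    }                 // b in {r2,r3}, result returned in {r0,r1}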
+ if (Subtarget->isAAPCS_ABI()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { + setLibcallCallingConv(static_cast<RTLIB::Libcall>(i), + CallingConv::ARM_AAPCS); + } + } + + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); else addRegisterClass(MVT::i32, ARM::GPRRegisterClass); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) { + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, ARM::SPRRegisterClass); addRegisterClass(MVT::f64, ARM::DPRRegisterClass); @@ -213,6 +251,39 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); + // v2f64 is legal so that QR subregs can be extracted as f64 elements, but + // neither Neon nor VFP support any arithmetic operations on it. + setOperationAction(ISD::FADD, MVT::v2f64, Expand); + setOperationAction(ISD::FSUB, MVT::v2f64, Expand); + setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + setOperationAction(ISD::FDIV, MVT::v2f64, Expand); + setOperationAction(ISD::FREM, MVT::v2f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); + setOperationAction(ISD::VSETCC, MVT::v2f64, Expand); + setOperationAction(ISD::FNEG, MVT::v2f64, Expand); + setOperationAction(ISD::FABS, MVT::v2f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); + setOperationAction(ISD::FSIN, MVT::v2f64, Expand); + setOperationAction(ISD::FCOS, MVT::v2f64, Expand); + setOperationAction(ISD::FPOWI, MVT::v2f64, Expand); + setOperationAction(ISD::FPOW, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); + setOperationAction(ISD::FEXP, MVT::v2f64, Expand); + setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); + setOperationAction(ISD::FRINT, MVT::v2f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + + // Neon does not support some operations on v1i64 and v2i64 types. + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); + setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); @@ -246,7 +317,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } // i64 operation support. 
- if (Subtarget->isThumb()) { + if (Subtarget->isThumb1Only()) { setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i32, Expand); setOperationAction(ISD::MULHS, MVT::i32, Expand); @@ -287,7 +358,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand); setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand); - setOperationAction(ISD::RET, MVT::Other, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); @@ -300,7 +370,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::VAEND, MVT::Other, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); + // FIXME: Shouldn't need this, since no register is used, but the legalizer + // doesn't yet know how to not do that for SjLj. + setExceptionSelectorRegister(ARM::R0); + if (Subtarget->isThumb()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) { @@ -309,7 +386,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) // Turn f64->i64 into FMRRD, i64 -> f64 to FMDRR iff target supports vfp2. setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom); @@ -339,7 +416,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) { + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); } @@ -347,7 +424,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FPOW, MVT::f32, Expand); // int <-> fp are custom expanded into bit_convert + ARMISD ops. - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) { + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); @@ -361,26 +438,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setStackPointerRegisterToSaveRestore(ARM::SP); setSchedulingPreference(SchedulingForRegPressure); - setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10); - setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2); - - if (!Subtarget->isThumb()) { - // Use branch latency information to determine if-conversion limits. - // FIXME: If-converter should use instruction latency of the branch being - // eliminated to compute the threshold. For ARMv6, the branch "latency" - // varies depending on whether it's dynamically or statically predicted - // and on whether the destination is in the prefetch buffer. 
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const InstrItineraryData &InstrItins = Subtarget->getInstrItineraryData(); - unsigned Latency= InstrItins.getLatency(TII->get(ARM::Bcc).getSchedClass()); - if (Latency > 1) { - setIfCvtBlockSizeLimit(Latency-1); - if (Latency > 2) - setIfCvtDupBlockSizeLimit(Latency-2); - } else { - setIfCvtBlockSizeLimit(10); - setIfCvtDupBlockSizeLimit(2); - } + + // FIXME: If-converter should use instruction latency to determine + // profitability rather than relying on fixed limits. + if (Subtarget->getCPUString() == "generic") { + // Generic (and overly aggressive) if-conversion limits. + setIfCvtBlockSizeLimit(10); + setIfCvtDupBlockSizeLimit(2); + } else if (Subtarget->hasV6Ops()) { + setIfCvtBlockSizeLimit(2); + setIfCvtDupBlockSizeLimit(1); + } else { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(2); } maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type @@ -401,6 +471,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::tCALL: return "ARMISD::tCALL"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; + case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; @@ -425,6 +496,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; + case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; + case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCGE: return "ARMISD::VCGE"; case ARMISD::VCGEU: return "ARMISD::VCGEU"; @@ -453,13 +526,21 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; - case ARMISD::VDUPLANEQ: return "ARMISD::VDUPLANEQ"; + case ARMISD::VDUP: return "ARMISD::VDUP"; + case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; + case ARMISD::VEXT: return "ARMISD::VEXT"; + case ARMISD::VREV64: return "ARMISD::VREV64"; + case ARMISD::VREV32: return "ARMISD::VREV32"; + case ARMISD::VREV16: return "ARMISD::VREV16"; + case ARMISD::VZIP: return "ARMISD::VZIP"; + case ARMISD::VUZP: return "ARMISD::VUZP"; + case ARMISD::VTRN: return "ARMISD::VTRN"; } } /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { - return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2; + return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1; } //===----------------------------------------------------------------------===// @@ -469,7 +550,7 @@ unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { switch (CC) { - default: assert(0 && "Unknown condition code!"); + default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return ARMCC::NE; case ISD::SETEQ: return ARMCC::EQ; case ISD::SETGT: return ARMCC::GT; @@ -483,15 +564,12 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { } } -/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. It -/// returns true if the operands should be inverted to form the proper -/// comparison. 
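Two asides on this hunk. First, the if-conversion limits set above bound how many instructions the if-converter may predicate per block; as a reminder of the transformation being limited, a sketch using ARM conditional execution:

    // Source:        if (a < b) x = y;
    // With a branch: cmp r0, r1 ; bge skip ; mov r2, r3 ; skip:
    // If-converted:  cmp r0, r1 ; movlt r2, r3

Second, context for the FPCCToARMCC change that follows, assuming the usual NZCV results of a VFP compare:

    // After an FP compare:  equal -> Z=1,C=1      less-than -> N=1,C=0
    //                       greater-than -> C=1   unordered -> C=1,V=1
    // LS tests (C==0 || Z==1), i.e. exactly the ordered "less than or
    // equal" cases, so SETOLE maps directly to LS and the old
    // invert-the-operands workaround can go away.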
-static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, +/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. +static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2) { - bool Invert = false; CondCode2 = ARMCC::AL; switch (CC) { - default: assert(0 && "Unknown FP condition!"); + default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = ARMCC::EQ; break; case ISD::SETGT: @@ -499,7 +577,7 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; - case ISD::SETOLE: CondCode = ARMCC::GT; Invert = true; break; + case ISD::SETOLE: CondCode = ARMCC::LS; break; case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; @@ -513,24 +591,16 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, case ISD::SETNE: case ISD::SETUNE: CondCode = ARMCC::NE; break; } - return Invert; } //===----------------------------------------------------------------------===// // Calling Convention Implementation -// -// The lower operations present on calling convention works on this order: -// LowerCALL (virt regs --> phys regs, virt regs --> stack) -// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs) -// LowerRET (virt regs --> phys regs) -// LowerCALL (phys regs --> virt regs) -// //===----------------------------------------------------------------------===// #include "ARMGenCallingConv.inc" // APCS f64 is in register pairs, possibly split to stack -static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State, bool CanFail) { static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -560,7 +630,7 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { @@ -573,7 +643,7 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, } // AAPCS f64 is in aligned register pairs -static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State, bool CanFail) { static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; @@ -603,7 +673,7 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { @@ -615,7 +685,7 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; // we handled it } -static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State) { static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; @@ -635,7 +705,7 @@ static bool f64RetAssign(unsigned &ValNo, MVT 
&ValVT, MVT &LocVT, return true; } -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { @@ -646,7 +716,7 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; // we handled it } -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { @@ -656,49 +726,48 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, /// CCAssignFnForNode - Selects the correct CCAssignFn for a the /// given CallingConvention value. -CCAssignFn *ARMTargetLowering::CCAssignFnForNode(unsigned CC, - bool Return) const { +CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, + bool Return, + bool isVarArg) const { switch (CC) { default: - assert(0 && "Unsupported calling convention"); + llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::Fast: - // Use target triple & subtarget features to do actual dispatch. - if (Subtarget->isAAPCS_ABI()) { - if (Subtarget->hasVFP2() && - FloatABIType == FloatABI::Hard) - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); - else - return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); - } else - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + // Use target triple & subtarget features to do actual dispatch. + if (Subtarget->isAAPCS_ABI()) { + if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard && !isVarArg) + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + else + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + } else + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); case CallingConv::ARM_AAPCS_VFP: - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); case CallingConv::ARM_AAPCS: - return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); case CallingConv::ARM_APCS: - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); } } -/// LowerCallResult - Lower the result values of an ISD::CALL into the -/// appropriate copies out of appropriate physical registers. This assumes that -/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call -/// being lowered. The returns a SDNode with the same number of values as the -/// ISD::CALL. -SDNode *ARMTargetLowering:: -LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, - unsigned CallingConv, SelectionDAG &DAG) { +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue +ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) { - DebugLoc dl = TheCall->getDebugLoc(); // Assign locations to each value returned by this call. 
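One detail worth flagging in CCAssignFnForNode above: the VFP-variant calling convention is only selected for non-variadic calls, matching the AAPCS rule that variadic routines always follow the base standard. A sketch of the visible effect, assuming a hard-float AAPCS target:

    void fixed(double d);               // d arrives in a VFP register
    void variadic(const char *, ...);   // variadic("x", d): d goes in the
                                        // core pair {r2,r3}, via the
                                        // isVarArg fallback to CC_ARM_AAPCS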
SmallVector<CCValAssign, 16> RVLocs; - bool isVarArg = TheCall->isVarArg(); - CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs); - CCInfo.AnalyzeCallResult(TheCall, - CCAssignFnForNode(CallingConv, /* Return*/ true)); - - SmallVector<SDValue, 8> ResultVals; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, + CCAssignFnForNode(CallConv, /* Return*/ true, + isVarArg)); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -743,20 +812,17 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, } switch (VA.getLocInfo()) { - default: assert(0 && "Unknown loc info!"); + default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val); break; } - ResultVals.push_back(Val); + InVals.push_back(Val); } - // Merge everything together with a MERGE_VALUES node. - ResultVals.push_back(Chain); - return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(), - &ResultVals[0], ResultVals.size()).getNode(); + return Chain; } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified @@ -776,11 +842,11 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, /// LowerMemOpCallTo - Store the argument to the stack. SDValue -ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, - const SDValue &StackPtr, - const CCValAssign &VA, SDValue Chain, - SDValue Arg, ISD::ArgFlagsTy Flags) { - DebugLoc dl = TheCall->getDebugLoc(); +ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); @@ -791,14 +857,13 @@ ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, PseudoSourceValue::getStack(), LocMemOffset); } -void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG, +void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVector<SDValue, 8> &MemOpChains, ISD::ArgFlagsTy Flags) { - DebugLoc dl = TheCall->getDebugLoc(); SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); @@ -811,27 +876,31 @@ void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG, if (StackPtr.getNode() == 0) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); - MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA, - Chain, fmrrd.getValue(1), Flags)); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1), + dl, DAG, NextVA, + Flags)); } } -/// LowerCALL - Lowering a ISD::CALL node into a callseq_start <- +/// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. 
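The callseq_start/callseq_end pair named in that comment brackets the entire outgoing-argument sequence so the stack adjustment stays explicit in the DAG. A simplified, hypothetical picture of the chain built by the function below:

    // CALLSEQ_START(#bytes)               // reserve the outgoing-arg area
    //   CopyToReg / stores for each arg   // RegsToPass and MemOpChains
    // ARMISD::CALL | tCALL | CALL_NOLINK  // CallOpc, chosen further down
    // CALLSEQ_END(#bytes)                 // release the area
    // LowerCallResult(...)                // copy results out of physregs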
-SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { - CallSDNode *TheCall = cast<CallSDNode>(Op.getNode()); - MVT RetVT = TheCall->getRetValType(0); - SDValue Chain = TheCall->getChain(); - unsigned CC = TheCall->getCallingConv(); - bool isVarArg = TheCall->isVarArg(); - SDValue Callee = TheCall->getCallee(); - DebugLoc dl = TheCall->getDebugLoc(); +SDValue +ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); - CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC, /* Return*/ false)); + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -851,12 +920,12 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = TheCall->getArg(realArgIdx); - ISD::ArgFlagsTy Flags = TheCall->getArgFlags(realArgIdx); + SDValue Arg = Outs[realArgIdx].Val; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { - default: assert(0 && "Unknown loc info!"); + default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); @@ -872,7 +941,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { break; } - // f64 and v2f64 are passed in i32 pairs and must be split into pieces + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, @@ -880,23 +949,23 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, MVT::i32)); - PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass, + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); VA = ArgLocs[++i]; // skip ahead to next loc if (VA.isRegLoc()) { - PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass, + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } else { assert(VA.isMemLoc()); if (StackPtr.getNode() == 0) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); - MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA, - Chain, Op1, Flags)); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, + dl, DAG, VA, Flags)); } } else { - PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { @@ -906,8 +975,8 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { if (StackPtr.getNode() == 0) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); - MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, 
StackPtr, VA, - Chain, Arg, Flags)); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); } } @@ -933,17 +1002,17 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { GlobalValue *GV = G->getGlobal(); isDirect = true; - bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() || - GV->hasLinkOnceLinkage()); + bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); bool isStub = (isExt && Subtarget->isTargetDarwin()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // ARM call to a local ARM function is predicable. isLocalARMFunc = !Subtarget->isThumb() && !isExt; // tBX takes a register source operand. - if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) { - ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex, - ARMCP::CPStub, 4); + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, + ARMPCLabelIndex, + ARMCP::CPValue, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, @@ -960,9 +1029,9 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { isARMFunc = !Subtarget->isThumb() || isStub; // tBX takes a register source operand. const char *Sym = S->getSymbol(); - if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) { - ARMConstantPoolValue *CPV = new ARMConstantPoolValue(Sym, ARMPCLabelIndex, - ARMCP::CPStub, 4); + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), + Sym, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, @@ -977,7 +1046,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if (!Subtarget->hasV5TOps() && (!isDirect || isARMFunc)) + if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; @@ -986,7 +1055,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL) : ARMISD::CALL_NOLINK; } - if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb()) { + if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) { // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag); InFlag = Chain.getValue(1); @@ -1011,30 +1080,31 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(0, true), InFlag); - if (RetVT != MVT::Other) + if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. 
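The tBX stub logic above exists because BLX only arrived with ARMv5T: on v4T a Thumb caller cannot switch instruction sets with a plain BL, so the ARM callee's address is materialized from the constant pool and the call goes through a register, with bit 0 of the address selecting the target state. Roughly (call shim details elided and platform-dependent):

    // ldr r2, .LCPI_callee   // address from the constant pool entry
    // bx  r2                 // register branch; bit 0 = Thumb/ARM state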
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG), - Op.getResNo()); + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, + dl, DAG, InVals); } -SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { - // The chain is always operand #0 - SDValue Chain = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); +SDValue +ARMTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + DebugLoc dl, SelectionDAG &DAG) { // CCValAssign - represent the assignment of the return value to a location. SmallVector<CCValAssign, 16> RVLocs; - unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); - bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); // CCState - Info about the registers and stack slots. - CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs); + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, + *DAG.getContext()); - // Analyze return values of ISD::RET. - CCInfo.AnalyzeReturn(Op.getNode(), CCAssignFnForNode(CC, /* Return */ true)); + // Analyze outgoing return values. + CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, + isVarArg)); // If this is the first return lowered for this function, add // the regs to the liveout set for the function. @@ -1053,12 +1123,10 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - // ISD::RET => ret chain, (regnum1,val1), ... - // So i*2+1 index only the regnums - SDValue Arg = Op.getOperand(realRVLocIdx*2+1); + SDValue Arg = Outs[realRVLocIdx].Val; switch (VA.getLocInfo()) { - default: assert(0 && "Unknown loc info!"); + default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); @@ -1112,13 +1180,13 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as -// their target countpart wrapped in the ARMISD::Wrapper node. Suppose N is +// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOVi. static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { - MVT PtrVT = Op.getValueType(); + EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here DebugLoc dl = Op.getDebugLoc(); ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); @@ -1137,11 +1205,11 @@ SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) { DebugLoc dl = GA->getDebugLoc(); - MVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(); unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; ARMConstantPoolValue *CPV = - new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, - PCAdj, "tlsgd", true); + new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, "tlsgd", true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, NULL, 0); @@ -1154,12 +1222,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ArgListTy Args; ArgListEntry Entry; Entry.Node = Argument; - Entry.Ty = (const Type *) Type::Int32Ty; + Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); // FIXME: is there useful debug info available here? std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, (const Type *) Type::Int32Ty, false, false, false, false, - 0, CallingConv::C, false, + LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()), + false, false, false, false, + 0, CallingConv::C, false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); return CallResult.first; } @@ -1173,16 +1242,16 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, DebugLoc dl = GA->getDebugLoc(); SDValue Offset; SDValue Chain = DAG.getEntryNode(); - MVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); - if (GV->isDeclaration()){ + if (GV->isDeclaration()) { // initial exec model unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = - new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, - PCAdj, "gottpoff", true); + new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, "gottpoff", true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0); @@ -1194,8 +1263,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0); } else { // local exec model - ARMConstantPoolValue *CPV = - new ARMConstantPoolValue(GV, ARMCP::CPValue, "tpoff"); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff"); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0); @@ -1222,59 +1290,47 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) { - MVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); if (RelocM == Reloc::PIC_) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); ARMConstantPoolValue *CPV = - new ARMConstantPoolValue(GV, ARMCP::CPValue, UseGOTOFF ? "GOTOFF":"GOT"); + new ARMConstantPoolValue(GV, UseGOTOFF ? 
"GOTOFF" : "GOT"); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - CPAddr, NULL, 0); + CPAddr, + PseudoSourceValue::getConstantPool(), 0); SDValue Chain = Result.getValue(1); SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); if (!UseGOTOFF) - Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0); + Result = DAG.getLoad(PtrVT, dl, Chain, Result, + PseudoSourceValue::getGOT(), 0); return Result; } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0); } } -/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol -/// even in non-static mode. -static bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) { - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode(); - if (GV->hasHiddenVisibility() && (!isDecl && !GV->hasCommonLinkage())) - return false; - return RelocM != Reloc::Static && (isDecl || GV->isWeakForLinker()); -} - SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) { - MVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); - bool IsIndirect = GVIsIndirectSymbol(GV, RelocM); SDValue CPAddr; if (RelocM == Reloc::Static) CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); else { - unsigned PCAdj = (RelocM != Reloc::PIC_) - ? 0 : (Subtarget->isThumb() ? 4 : 8); - ARMCP::ARMCPKind Kind = IsIndirect ? ARMCP::CPNonLazyPtr - : ARMCP::CPValue; - ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex, - Kind, PCAdj); + unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8); + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); @@ -1286,7 +1342,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } - if (IsIndirect) + + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0); return Result; @@ -1296,32 +1353,55 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG){ assert(Subtarget->isTargetELF() && "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); - MVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; - ARMConstantPoolValue *CPV = new ARMConstantPoolValue("_GLOBAL_OFFSET_TABLE_", - ARMPCLabelIndex, - ARMCP::CPValue, PCAdj); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), + "_GLOBAL_OFFSET_TABLE_", + ARMPCLabelIndex, PCAdj); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0); + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { - MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::arm_thread_pointer: - return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + case Intrinsic::arm_thread_pointer: { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::eh_sjlj_lsda: { + MachineFunction &MF = DAG.getMachineFunction(); + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + unsigned PCAdj = (RelocM != Reloc::PIC_) + ? 0 : (Subtarget->isThumb() ? 4 : 8); + ARMConstantPoolValue *CPV = + new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex, + ARMCP::CPLSDA, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0); + SDValue Chain = Result.getValue(1); + + if (RelocM == Reloc::PIC_) { + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } + return Result; + } case Intrinsic::eh_sjlj_setjmp: - SDValue Res = DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, - Op.getOperand(1)); - return Res; + return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1)); } } @@ -1330,13 +1410,60 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. DebugLoc dl = Op.getDebugLoc(); - MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); } SDValue +ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + EVT VT = Node->getValueType(0); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. 
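The bracketing that begins here is worth spelling out: wrapping the allocation in a call sequence keeps the scheduler from interleaving other SP-relative work with the SP adjustment. The kind of code that needs this, with hypothetical names:

    void g(char *);
    void f(unsigned n) {
      char *buf = (char *) __builtin_alloca(n);  // becomes ARMISD::DYN_ALLOC
      g(buf);  // outgoing-argument stores must not straddle the SP update
    }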
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); + + unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment(); + if (AlignVal > StackAlign) + // Do this now since selection pass cannot introduce new target + // independent node. + Align = DAG.getConstant(-(uint64_t)AlignVal, VT); + + // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, we will end up + // using a "add r, sp, r" instead. Negate the size now so we don't have to + // do even more horrible hack later. + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (AFI->isThumb1OnlyFunction()) { + bool Negate = true; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size); + if (C) { + uint32_t Val = C->getZExtValue(); + if (Val <= 508 && ((Val & 3) == 0)) + Negate = false; + } + if (Negate) + Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size); + } + + SDVTList VTList = DAG.getVTList(VT, MVT::Other); + SDValue Ops1[] = { Chain, Size, Align }; + SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3); + Chain = Res.getValue(1); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), + DAG.getIntPtrConstant(0, true), SDValue()); + SDValue Ops2[] = { Res, Chain }; + return DAG.getMergeValues(Ops2, 2, dl); +} + +SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, DebugLoc dl) { @@ -1344,7 +1471,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); TargetRegisterClass *RC; - if (AFI->isThumbFunction()) + if (AFI->isThumb1OnlyFunction()) RC = ARM::tGPRRegisterClass; else RC = ARM::GPRRegisterClass; @@ -1371,21 +1498,25 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, } SDValue -ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { +ARMTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) { + MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - SDValue Root = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); - bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0; - unsigned CC = MF.getFunction()->getCallingConv(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); - CCInfo.AnalyzeFormalArguments(Op.getNode(), - CCAssignFnForNode(CC, /* Return*/ false)); + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + CCInfo.AnalyzeFormalArguments(Ins, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); SmallVector<SDValue, 16> ArgValues; @@ -1394,7 +1525,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { // Arguments stored in registers. 
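And the Thumb1 contortion in the DYNAMIC_STACKALLOC lowering above, spelled out: Thumb1 has no "sub r, sp, r" encoding, and its immediate form only reaches word-aligned sizes up to 508 (hence the Val <= 508 && (Val & 3) == 0 test), so any other size is negated up front and added instead. A hypothetical expansion of the resulting DYN_ALLOC, along the lines the comment describes:

    // rsbs r4, r4, #0   // Size = 0 - Size (the ISD::SUB inserted above)
    // add  r4, sp, r4   // "add r, sp, r" stands in for the missing sub
    // mov  sp, r4       // commit the new stack pointer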
if (VA.isRegLoc()) { - MVT RegVT = VA.getLocVT(); + EVT RegVT = VA.getLocVT(); SDValue ArgValue; if (VA.needsCustom()) { @@ -1404,43 +1535,43 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { if (VA.getLocVT() == MVT::v2f64) { SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], - Root, DAG, dl); + Chain, DAG, dl); VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], - Root, DAG, dl); + Chain, DAG, dl); ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue1, DAG.getIntPtrConstant(0)); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue2, DAG.getIntPtrConstant(1)); } else - ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl); + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { TargetRegisterClass *RC; - if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32) + + if (RegVT == MVT::f32) RC = ARM::SPRRegisterClass; - else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64) + else if (RegVT == MVT::f64) RC = ARM::DPRRegisterClass; - else if (AFI->isThumbFunction()) - RC = ARM::tGPRRegisterClass; + else if (RegVT == MVT::v2f64) + RC = ARM::QPRRegisterClass; + else if (RegVT == MVT::i32) + RC = (AFI->isThumb1OnlyFunction() ? + ARM::tGPRRegisterClass : ARM::GPRRegisterClass); else - RC = ARM::GPRRegisterClass; - - assert((RegVT == MVT::i32 || RegVT == MVT::f32 || - (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) && - "RegVT not supported by FORMAL_ARGUMENTS Lowering"); + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { - default: assert(0 && "Unknown loc info!"); + default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); @@ -1457,7 +1588,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { break; } - ArgValues.push_back(ArgValue); + InVals.push_back(ArgValue); } else { // VA.isRegLoc() @@ -1470,7 +1601,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { // Create load nodes to retrieve arguments from the stack. 
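
The register-class selection above reads as a small type-driven table; this condensed helper (hypothetical, mirroring the if/else chain in the patch, not part of it) makes the mapping explicit:

    // Thumb1 integer arguments are limited to r0-r7 (tGPR); VFP and NEON
    // types land in S, D and Q registers respectively.
    static TargetRegisterClass *pickArgRegClass(EVT RegVT, bool Thumb1Only) {
      if (RegVT == MVT::f32)   return ARM::SPRRegisterClass;
      if (RegVT == MVT::f64)   return ARM::DPRRegisterClass;
      if (RegVT == MVT::v2f64) return ARM::QPRRegisterClass;
      if (RegVT == MVT::i32)
        return Thumb1Only ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
      llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS lowering");
    }
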
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0)); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0)); } } @@ -1500,31 +1631,27 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { SmallVector<SDValue, 4> MemOps; for (; NumGPRs < 4; ++NumGPRs) { TargetRegisterClass *RC; - if (AFI->isThumbFunction()) + if (AFI->isThumb1OnlyFunction()) RC = ARM::tGPRRegisterClass; else RC = ARM::GPRRegisterClass; unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); - SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); } if (!MemOps.empty()) - Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset); } - ArgValues.push_back(Root); - - // Return the new list of results. - return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(), - &ArgValues[0], ArgValues.size()).getValue(Op.getResNo()); + return Chain; } /// isFloatingPointZero - Return true if this is +0.0. @@ -1543,46 +1670,46 @@ static bool isFloatingPointZero(SDValue Op) { return false; } -static bool isLegalCmpImmediate(unsigned C, bool isThumb) { - return ( isThumb && (C & ~255U) == 0) || - (!isThumb && ARM_AM::getSOImmVal(C) != -1); +static bool isLegalCmpImmediate(unsigned C, bool isThumb1Only) { + return ( isThumb1Only && (C & ~255U) == 0) || + (!isThumb1Only && ARM_AM::getSOImmVal(C) != -1); } /// Returns appropriate ARM CMP (cmp) and corresponding condition code for /// the given operands. static SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMCC, SelectionDAG &DAG, bool isThumb, + SDValue &ARMCC, SelectionDAG &DAG, bool isThumb1Only, DebugLoc dl) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); - if (!isLegalCmpImmediate(C, isThumb)) { + if (!isLegalCmpImmediate(C, isThumb1Only)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: - if (isLegalCmpImmediate(C-1, isThumb)) { + if (isLegalCmpImmediate(C-1, isThumb1Only)) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(C-1, MVT::i32); } break; case ISD::SETULT: case ISD::SETUGE: - if (C > 0 && isLegalCmpImmediate(C-1, isThumb)) { + if (C > 0 && isLegalCmpImmediate(C-1, isThumb1Only)) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; RHS = DAG.getConstant(C-1, MVT::i32); } break; case ISD::SETLE: case ISD::SETGT: - if (isLegalCmpImmediate(C+1, isThumb)) { + if (isLegalCmpImmediate(C+1, isThumb1Only)) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(C+1, MVT::i32); } break; case ISD::SETULE: case ISD::SETUGT: - if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb)) { + if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb1Only)) { CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(C+1, MVT::i32); } @@ -1620,7 +1747,7 @@ static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); @@ -1631,13 +1758,12 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG, if (LHS.getValueType() == MVT::i32) { SDValue ARMCC; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb1Only(), dl); return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp); } ARMCC::CondCodes CondCode, CondCode2; - if (FPCCToARMCC(CC, CondCode, CondCode2)) - std::swap(TrueVal, FalseVal); + FPCCToARMCC(CC, CondCode, CondCode2); SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); @@ -1666,16 +1792,14 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG, if (LHS.getValueType() == MVT::i32) { SDValue ARMCC; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb1Only(), dl); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMCC, CCR,Cmp); } assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); ARMCC::CondCodes CondCode, CondCode2; - if (FPCCToARMCC(CC, CondCode, CondCode2)) - // Swap the LHS/RHS of the comparison if needed. - std::swap(LHS, RHS); + FPCCToARMCC(CC, CondCode, CondCode2); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); @@ -1697,21 +1821,32 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) { SDValue Index = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - MVT PTy = getPointerTy(); + EVT PTy = getPointerTy(); JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); - SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy); + SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId); Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); - bool isPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; - Addr = DAG.getLoad(isPIC ? (MVT)MVT::i32 : PTy, dl, - Chain, Addr, NULL, 0); - Chain = Addr.getValue(1); - if (isPIC) + if (Subtarget->isThumb2()) { + // Thumb2 uses a two-level jump. That is, it jumps into the jump table + // which does another jump to the destination. This also makes it easier + // to translate it to TBB / TBH later. + // FIXME: This might not work if the function is extremely large. 
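
The Thumb2 comment above refers to the TBB/TBH table-branch encodings, whose entries are compressed forward offsets counted in halfwords rather than absolute addresses; that compression is also why the FIXME worries about extremely large functions. A simplified model of the dispatch arithmetic (the real base is the PC value at the table-branch instruction):

    #include <cstdint>
    // Branch target = base + 2 * entry; entries are bytes for TBB and
    // halfwords for TBH, and Thumb targets are 2-byte aligned.
    static uint32_t tbbTarget(uint32_t base, const uint8_t *tbl, unsigned i) {
      return base + 2u * tbl[i];
    }
    static uint32_t tbhTarget(uint32_t base, const uint16_t *tbl, unsigned i) {
      return base + 2u * tbl[i];
    }
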
+ return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, + Addr, Op.getOperand(2), JTI, UId); + } + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, NULL, 0); + Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); - return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + } else { + Addr = DAG.getLoad(PTy, dl, Chain, Addr, NULL, 0); + Chain = Addr.getValue(1); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + } } static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { @@ -1723,7 +1858,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned Opc = Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF; @@ -1737,8 +1872,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); DebugLoc dl = Op.getDebugLoc(); - MVT VT = Op.getValueType(); - MVT SrcVT = Tmp1.getValueType(); + EVT VT = Op.getValueType(); + EVT SrcVT = Tmp1.getValueType(); SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0); SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl); SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32); @@ -1749,7 +1884,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin()) @@ -1784,7 +1919,7 @@ ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, unsigned BytesLeft = SizeVal & 3; unsigned NumMemOps = SizeVal >> 2; unsigned EmittedNumMemOps = 0; - MVT VT = MVT::i32; + EVT VT = MVT::i32; unsigned VTSize = 4; unsigned i = 0; const unsigned MAX_LOADS_IN_LDM = 6; @@ -1890,45 +2025,55 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// -static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Zero vectors are used to represent vector negation and in those cases // will be implemented with the NEON VNEG instruction. However, VNEG does // not support i64 elements, so sometimes the zero vectors will need to be // explicitly constructed. For those cases, and potentially other uses in - // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted + // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted // to their dest type. This ensures they get CSE'd. 
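
Both vector-constant builders in this hunk share one shape: splat a byte constant in a canonical all-i8 type, then bitconvert, so equal-width zero (or ones) vectors of different element types CSE to a single node. (The updated getOnesVector comment says <16 x i32> or <8 x i32>, but its code, like getZeroVector, builds i8 vectors.) A condensed sketch of the shared shape, with an invented helper name:

    static SDValue buildCanonicalSplat(unsigned Byte, EVT VT,
                                       SelectionDAG &DAG, DebugLoc dl) {
      EVT CanonVT = (VT.getSizeInBits() == 64) ? (EVT)MVT::v8i8
                                               : (EVT)MVT::v16i8;
      SDValue Cst = DAG.getTargetConstant(Byte, MVT::i8);
      SmallVector<SDValue, 16> Ops(CanonVT.getVectorNumElements(), Cst);
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonVT,
                                &Ops[0], Ops.size());
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
    }
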
SDValue Vec; - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - if (VT.getSizeInBits() == 64) - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - else - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + SDValue Cst = DAG.getTargetConstant(0, MVT::i8); + SmallVector<SDValue, 8> Ops; + MVT TVT; + + if (VT.getSizeInBits() == 64) { + Ops.assign(8, Cst); TVT = MVT::v8i8; + } else { + Ops.assign(16, Cst); TVT = MVT::v16i8; + } + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. /// -static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { +static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest - // type. This ensures they get CSE'd. + // Always build ones vectors as <16 x i32> or <8 x i32> bitcasted to their + // dest type. This ensures they get CSE'd. SDValue Vec; - SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); - if (VT.getSizeInBits() == 64) - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - else - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8); + SmallVector<SDValue, 8> Ops; + MVT TVT; + + if (VT.getSizeInBits() == 64) { + Ops.assign(8, Cst); TVT = MVT::v8i8; + } else { + Ops.assign(16, Cst); TVT = MVT::v16i8; + } + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - MVT VT = N->getValueType(0); + EVT VT = N->getValueType(0); DebugLoc dl = N->getDebugLoc(); // Lower vector shifts on NEON to use VSHL. @@ -1947,7 +2092,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, // NEON uses the same intrinsics for both left and right shifts. For // right shifts, the shift amounts are negative, so negate the vector of // shift amounts. - MVT ShiftVT = N->getOperand(1).getValueType(); + EVT ShiftVT = N->getOperand(1).getValueType(); SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); @@ -1959,8 +2104,11 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, N->getOperand(0), NegatedCount); } - assert(VT == MVT::i64 && - (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + // We can get here for a node like i32 = ISD::SHL i32, i64 + if (VT != MVT::i64) + return SDValue(); + + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. @@ -1969,7 +2117,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, return SDValue(); // If we are in thumb mode, we don't have RRX. - if (ST->isThumb()) return SDValue(); + if (ST->isThumb1Only()) return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 
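
The RRX expansion announced above splits the i64 into 32-bit halves: the high word is shifted with flags set so its low bit lands in the carry, and the low word is rotated right through carry. A bit-level model that can be run directly:

    #include <cassert>
    #include <cstdint>
    // srl i64 X, 1: "lsrs hi" moves bit 0 of the high word into the carry
    // flag, then "rrx lo" rotates the low word right one bit through the
    // carry, carrying that bit across the 32-bit boundary.
    int main() {
      uint64_t X  = 0x8000000100000003ULL;
      uint32_t lo = (uint32_t)X, hi = (uint32_t)(X >> 32);
      unsigned carry = hi & 1;                    // LSRS: bit 0 -> C
      uint32_t hi2 = hi >> 1;
      uint32_t lo2 = (lo >> 1) | (carry << 31);   // RRX: C -> bit 31
      assert((((uint64_t)hi2 << 32) | lo2) == (X >> 1));
      return 0;
    }
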
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), @@ -1998,13 +2146,13 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); DebugLoc dl = Op.getDebugLoc(); if (Op.getOperand(1).getValueType().isFloatingPoint()) { switch (SetCCOpcode) { - default: assert(0 && "Illegal FP comparison"); break; + default: llvm_unreachable("Illegal FP comparison"); break; case ISD::SETUNE: case ISD::SETNE: Invert = true; // Fallthrough case ISD::SETOEQ: @@ -2043,7 +2191,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { } else { // Integer comparisons. switch (SetCCOpcode) { - default: assert(0 && "Illegal integer comparison"); break; + default: llvm_unreachable("Illegal integer comparison"); break; case ISD::SETNE: Invert = true; case ISD::SETEQ: Opc = ARMISD::VCEQ; break; case ISD::SETLT: Swap = true; @@ -2056,7 +2204,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { case ISD::SETUGE: Opc = ARMISD::VCGEU; break; } - // Detect VTST (Vector Test Bits) = vicmp ne (and (op0, op1), zero). + // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). if (Opc == ARMISD::VCEQ) { SDValue AndOp; @@ -2147,7 +2295,7 @@ static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, } default: - assert(0 && "unexpected size for isVMOVSplat"); + llvm_unreachable("unexpected size for isVMOVSplat"); break; } @@ -2174,22 +2322,123 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { SplatBitSize, DAG); } -static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) { +static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, + bool &ReverseVEXT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + ReverseVEXT = false; + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, it may still be + // a VEXT but the source vectors must be swapped. + ExpectedElt += 1; + if (ExpectedElt == NumElts * 2) { + ExpectedElt = 0; + ReverseVEXT = true; + } + + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } + + // Adjust the index value if the source operands will be swapped. + if (ReverseVEXT) + Imm -= NumElts; + + return true; +} + +/// isVREVMask - Check if a vector shuffle corresponds to a VREV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) 
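
isVREVMask, defined just below, accepts masks that reverse elements within fixed-size blocks. Concrete v8i8 examples, checked against a standalone copy of the predicate:

    #include <cassert>
    // VREV64.8 reverses each 64-bit block: <7,6,5,4,3,2,1,0>.
    // VREV32.8 reverses each 32-bit block: <3,2,1,0,7,6,5,4>.
    static bool isVREV(const int M[], unsigned NumElts, unsigned EltSz,
                       unsigned BlockSize) {
      unsigned BlockElts = M[0] + 1;
      if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
        return false;
      for (unsigned i = 0; i < NumElts; ++i)
        if ((unsigned)M[i] !=
            (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
          return false;
      return true;
    }
    int main() {
      const int Rev64[8] = {7, 6, 5, 4, 3, 2, 1, 0};
      const int Rev32[8] = {3, 2, 1, 0, 7, 6, 5, 4};
      assert(isVREV(Rev64, 8, 8, 64) && isVREV(Rev32, 8, 8, 32));
      assert(!isVREV(Rev32, 8, 8, 64));
      return 0;
    }
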
+static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned BlockSize) { + assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && + "Only possible block sizes for VREV are: 16, 32, 64"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + unsigned BlockElts = M[0] + 1; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned) M[i] != + (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + return false; + } + + return true; +} + +static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((unsigned) M[i] != i + WhichResult || + (unsigned) M[i+1] != i + NumElts + WhichResult) + return false; + } + return true; +} + +static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if ((unsigned) M[i] != 2 * i + WhichResult) + return false; + } + + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32) + return false; + + return true; +} + +static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((unsigned) M[i] != Idx || + (unsigned) M[i+1] != Idx + NumElts) + return false; + Idx += 1; + } + + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32) + return false; + + return true; +} + +static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) { // Canonicalize all-zeros and all-ones vectors. - ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode()); + ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode()); if (ConstVal->isNullValue()) return getZeroVector(VT, DAG, dl); if (ConstVal->isAllOnesValue()) return getOnesVector(VT, DAG, dl); - MVT CanonicalVT; + EVT CanonicalVT; if (VT.is64BitVector()) { switch (Val.getValueType().getSizeInBits()) { case 8: CanonicalVT = MVT::v8i8; break; case 16: CanonicalVT = MVT::v4i16; break; case 32: CanonicalVT = MVT::v2i32; break; case 64: CanonicalVT = MVT::v1i64; break; - default: assert(0 && "unexpected splat element type"); break; + default: llvm_unreachable("unexpected splat element type"); break; } } else { assert(VT.is128BitVector() && "unknown splat vector size"); @@ -2198,7 +2447,7 @@ static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) { case 16: CanonicalVT = MVT::v8i16; break; case 32: CanonicalVT = MVT::v4i32; break; case 64: CanonicalVT = MVT::v2i64; break; - default: assert(0 && "unexpected splat element type"); break; + default: llvm_unreachable("unexpected splat element type"); break; } } @@ -2213,69 +2462,291 @@ static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) { // If this is a case we can't handle, return null and let the default // expansion code take care of it. 
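
For a concrete picture of the three two-result predicates above, these are the v4i16 masks each one accepts, with a runnable spot check of the VZIP case:

    #include <cassert>
    // v4i16 (NumElts = 4) shuffle masks recognized above:
    //   VTRN: <0,4,2,6> (left result)   <1,5,3,7> (right result)
    //   VUZP: <0,2,4,6>                 <1,3,5,7>
    //   VZIP: <0,4,1,5>                 <2,6,3,7>
    int main() {
      const int ZipL[4] = {0, 4, 1, 5};
      unsigned W = (ZipL[0] == 0 ? 0 : 1);   // WhichResult
      unsigned Idx = W * 4 / 2;
      for (unsigned i = 0; i != 4; i += 2, ++Idx)
        assert((unsigned)ZipL[i] == Idx && (unsigned)ZipL[i+1] == Idx + 4);
      return 0;
    }
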
static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); - assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, DAG); - if (Val.getNode()) - return BuildSplat(Val, Op.getValueType(), DAG, dl); + if (SplatBitSize <= 64) { + SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, DAG); + if (Val.getNode()) + return BuildSplat(Val, VT, DAG, dl); + } + } + + // If there are only 2 elements in a 128-bit vector, insert them into an + // undef vector. This handles the common case for 128-bit vector argument + // passing, where the insertions should be translated to subreg accesses + // with no real instructions. + if (VT.is128BitVector() && Op.getNumOperands() == 2) { + SDValue Val = DAG.getUNDEF(VT); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0, + DAG.getIntPtrConstant(0)); + if (Op1.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1, + DAG.getIntPtrConstant(1)); + return Val; } return SDValue(); } -static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { - return Op; +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool ReverseVEXT; + unsigned Imm, WhichResult; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16) || + isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTRNMask(M, VT, WhichResult) || + isVUZPMask(M, VT, WhichResult) || + isVZIPMask(M, VT, WhichResult)); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. 
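
GeneratePerfectShuffle, defined next, unpacks 32-bit table entries; the layout implied by its shifts and masks is summarized in this standalone decoder (struct and function names invented). Each sub-shuffle id packs four lane indices in base 9, with 8 standing for an undef lane:

    #include <cstdint>
    // [31:30] cost  [29:26] opcode  [25:13] LHS id  [12:0] RHS id
    struct PFFields { unsigned Cost, OpNum, LHSID, RHSID; };
    static PFFields decodePFEntry(uint32_t E) {
      PFFields F;
      F.Cost  =  E >> 30;
      F.OpNum = (E >> 26) & 0x0F;
      F.LHSID = (E >> 13) & ((1u << 13) - 1);
      F.RHSID =  E        & ((1u << 13) - 1);
      return F;
    }
    // Identity ids seen in the OP_COPY check: <0,1,2,3> packs to
    // (1*9+2)*9+3 = 102 and <4,5,6,7> to ((4*9+5)*9+6)*9+7 = 3382.
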
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + DebugLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1*9+2)*9+3) return LHS; + assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, + OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32)); + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: + return DAG.getNode(ARMISD::VEXT, dl, VT, + OpLHS, OpRHS, + DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32)); + case OP_VUZPL: + case OP_VUZPR: + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); + case OP_VZIPL: + case OP_VZIPR: + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); + case OP_VTRNL: + case OP_VTRNR: + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); + } } -static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { - return Op; +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + SmallVector<int, 8> ShuffleMask; + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + // FIXME: floating-point vectors should be canonicalized to integer vectors + // of the same time so that they get CSEd properly. 
+ SVN->getMask(ShuffleMask); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); + } + + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } + + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. + unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + return SDValue(); } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType(); + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); - assert((VT == MVT::i8 || VT == MVT::i16) && - "unexpected type for custom-lowering vector extract"); SDValue Vec = Op.getOperand(0); SDValue Lane = Op.getOperand(1); + + // FIXME: This is invalid for 8 and 16-bit elements - the information about + // sign / zero extension is lost! Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); + + if (VT.bitsLT(MVT::i32)) + Op = DAG.getNode(ISD::TRUNCATE, dl, VT, Op); + else if (VT.bitsGT(MVT::i32)) + Op = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op); + + return Op; } -static SDValue LowerCONCAT_VECTORS(SDValue Op) { - if (Op.getValueType().is128BitVector() && Op.getNumOperands() == 2) - return Op; - return SDValue(); +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. 
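
The CONCAT_VECTORS comment above leans on NEON's register aliasing: each Q register is exactly a pair of consecutive D registers, so the two f64 insertions that follow lower to subregister writes with no data movement. The aliasing, as a trivial runnable check:

    #include <cassert>
    // Qn aliases D(2n) and D(2n+1); elements 0 and 1 of a v2f64 simply
    // name the low and high D halves of the Q register.
    static void qAliases(unsigned Q, unsigned &DLo, unsigned &DHi) {
      DLo = 2 * Q;
      DHi = 2 * Q + 1;
    }
    int main() {
      unsigned lo, hi;
      qAliases(3, lo, hi);           // an arbitrary example register, Q3
      assert(lo == 6 && hi == 7);
      return 0;
    }
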
+ assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && + "unexpected CONCAT_VECTORS"); + DebugLoc dl = Op.getDebugLoc(); + SDValue Val = DAG.getUNDEF(MVT::v2f64); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0), + DAG.getIntPtrConstant(0)); + if (Op1.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1), + DAG.getIntPtrConstant(1)); + return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val); } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { switch (Op.getOpcode()) { - default: assert(0 && "Don't know how to custom lower this!"); abort(); + default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); - case ISD::CALL: return LowerCALL(Op, DAG); - case ISD::RET: return LowerRET(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, Subtarget); case ISD::BR_CC: return LowerBR_CC(Op, DAG, Subtarget); case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG, VarArgsFrameIndex); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); - case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); case ISD::RETURNADDR: break; case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); @@ -2287,9 +2758,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::VSETCC: return LowerVSETCC(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); - case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); } return SDValue(); } @@ -2301,7 +2771,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG) { switch (N->getOpcode()) { default: - assert(0 && "Don't know how to custom expand this!"); + llvm_unreachable("Don't know how to custom expand this!"); return; case ISD::BIT_CONVERT: Results.push_back(ExpandBIT_CONVERT(N, DAG)); @@ -2322,12 +2792,14 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { + MachineBasicBlock *BB, + DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); switch (MI->getOpcode()) { - default: assert(false && "Unexpected instr type to insert"); - case ARM::tMOVCCr: { + default: + llvm_unreachable("Unexpected instr type to insert"); + case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the // 
diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the @@ -2352,12 +2824,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->insert(It, sinkMBB); // Update machine-CFG edges by first adding all successors of the current // block to the new block which will contain the Phi node for the select. - for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), - e = BB->succ_end(); i != e; ++i) - sinkMBB->addSuccessor(*i); + // Also inform sdisel of the edge changes. + for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), + E = BB->succ_end(); I != E; ++I) { + EM->insert(std::make_pair(*I, sinkMBB)); + sinkMBB->addSuccessor(*I); + } // Next, remove all successors of the current block, and add the true // and fallthrough blocks as its successors. - while(!BB->succ_empty()) + while (!BB->succ_empty()) BB->removeSuccessor(BB->succ_begin()); BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); @@ -2381,6 +2856,78 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. return BB; } + + case ARM::tANDsp: + case ARM::tADDspr_: + case ARM::tSUBspi_: + case ARM::t2SUBrSPi_: + case ARM::t2SUBrSPi12_: + case ARM::t2SUBrSPs_: { + MachineFunction *MF = BB->getParent(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + bool DstIsDead = MI->getOperand(0).isDead(); + bool SrcIsKill = MI->getOperand(1).isKill(); + + if (SrcReg != ARM::SP) { + // Copy the source to SP from virtual register. + const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); + unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) + ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; + BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + } + + unsigned OpOpc = 0; + bool NeedPred = false, NeedCC = false, NeedOp3 = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected pseudo instruction!"); + case ARM::tANDsp: + OpOpc = ARM::tAND; + NeedPred = true; + break; + case ARM::tADDspr_: + OpOpc = ARM::tADDspr; + break; + case ARM::tSUBspi_: + OpOpc = ARM::tSUBspi; + break; + case ARM::t2SUBrSPi_: + OpOpc = ARM::t2SUBrSPi; + NeedPred = true; NeedCC = true; + break; + case ARM::t2SUBrSPi12_: + OpOpc = ARM::t2SUBrSPi12; + NeedPred = true; + break; + case ARM::t2SUBrSPs_: + OpOpc = ARM::t2SUBrSPs; + NeedPred = true; NeedCC = true; NeedOp3 = true; + break; + } + MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP); + if (OpOpc == ARM::tAND) + AddDefaultT1CC(MIB); + MIB.addReg(ARM::SP); + MIB.addOperand(MI->getOperand(2)); + if (NeedOp3) + MIB.addOperand(MI->getOperand(3)); + if (NeedPred) + AddDefaultPred(MIB); + if (NeedCC) + AddDefaultCC(MIB); + + // Copy the result from SP to virtual register. + const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); + unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) + ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; + BuildMI(BB, dl, TII->get(CopyOpc)) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(ARM::SP); + MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
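
The SP-pseudo expansion just above shuttles values through SP because the compact Thumb encodings of these operations accept only SP as the stack operand. Schematically (register names are placeholders, shown as comments only):

    // mov  sp, rSrc       ; tMOVtgpr2gpr / tMOVgpr2gpr: vreg into SP
    // op   sp, sp, ...    ; the real SP-only instruction (tSUBspi,
    //                     ; t2SUBrSPi, ...), plus predication/CC operands
    // mov  rDst, sp       ; copy the result back out into a vreg
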
+ return BB; + } } } @@ -2393,7 +2940,7 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = N->getValueType(0); + EVT VT = N->getValueType(0); unsigned Opc = N->getOpcode(); bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC; SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1); @@ -2421,7 +2968,7 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, cast<ConstantSDNode>(RHS)->isNullValue()) { std::swap(LHS, RHS); SDValue Op0 = Slct.getOperand(0); - MVT OpVT = isSlctCC ? Op0.getValueType() : + EVT OpVT = isSlctCC ? Op0.getValueType() : Op0.getOperand(0).getValueType(); bool isInt = OpVT.isInteger(); CC = ISD::getSetCCInverse(CC, isInt); @@ -2516,7 +3063,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. -static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) { +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) @@ -2530,7 +3077,7 @@ static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) { /// absolute value must be in the range: /// 1 <= |Value| <= ElementBits for a right shift; or /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, MVT VT, bool isNarrow, bool isIntrinsic, +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); @@ -2571,7 +3118,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vqrshiftns: case Intrinsic::arm_neon_vqrshiftnu: case Intrinsic::arm_neon_vqrshiftnsu: { - MVT VT = N->getOperand(1).getValueType(); + EVT VT = N->getOperand(1).getValueType(); int64_t Cnt; unsigned VShiftOpc = 0; @@ -2593,8 +3140,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vshiftlu: if (isVShiftLImm(N->getOperand(2), VT, true, Cnt)) break; - assert(0 && "invalid shift count for vshll intrinsic"); - abort(); + llvm_unreachable("invalid shift count for vshll intrinsic"); case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: @@ -2611,8 +3157,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vqshiftsu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) break; - assert(0 && "invalid shift count for vqshlu intrinsic"); - abort(); + llvm_unreachable("invalid shift count for vqshlu intrinsic"); case Intrinsic::arm_neon_vshiftn: case Intrinsic::arm_neon_vrshiftn: @@ -2625,11 +3170,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Narrowing shifts require an immediate right shift. 
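
The ranges documented above for vector shift immediates, restated as runnable predicates (left shifts count from 0 up to the element width, right shifts count from 1, halved for narrowing shifts):

    #include <cassert>
    static bool vshlImmOK(long long V, unsigned EB, bool isLong) {
      return V >= 0 && (isLong ? V <= (long long)EB : V < (long long)EB);
    }
    static bool vshrImmOK(long long V, unsigned EB, bool isNarrow) {
      long long A = V < 0 ? -V : V;
      return A >= 1 && A <= (long long)(isNarrow ? EB / 2 : EB);
    }
    int main() {
      assert(vshlImmOK(32, 32, true) && !vshlImmOK(32, 32, false));
      assert(vshrImmOK(-16, 32, true) && !vshrImmOK(17, 32, true));
      return 0;
    }
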
if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; - assert(0 && "invalid shift count for narrowing vector shift intrinsic"); - abort(); + llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); default: - assert(0 && "unhandled vector shift"); + llvm_unreachable("unhandled vector shift"); } switch (IntNo) { @@ -2678,7 +3222,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { } case Intrinsic::arm_neon_vshiftins: { - MVT VT = N->getOperand(1).getValueType(); + EVT VT = N->getOperand(1).getValueType(); int64_t Cnt; unsigned VShiftOpc = 0; @@ -2687,8 +3231,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) VShiftOpc = ARMISD::VSRI; else { - assert(0 && "invalid shift count for vsli/vsri intrinsic"); - abort(); + llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), @@ -2712,7 +3255,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { /// their values after they get legalized to loads from a constant pool. static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - MVT VT = N->getValueType(0); + EVT VT = N->getValueType(0); // Nothing to be done for scalar shifts. if (! VT.isVector()) @@ -2722,7 +3265,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, int64_t Cnt; switch (N->getOpcode()) { - default: assert(0 && "unexpected shift opcode"); + default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) @@ -2755,8 +3298,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue Vec = N0.getOperand(0); SDValue Lane = N0.getOperand(1); - MVT VT = N->getValueType(0); - MVT EltVT = N0.getValueType(); + EVT VT = N->getValueType(0); + EVT EltVT = N0.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (VT == MVT::i32 && @@ -2765,7 +3308,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, unsigned Opc = 0; switch (N->getOpcode()) { - default: assert(0 && "unexpected opcode"); + default: llvm_unreachable("unexpected opcode"); case ISD::SIGN_EXTEND: Opc = ARMISD::VGETLANEs; break; @@ -2802,10 +3345,88 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { + if (!Subtarget->hasV6Ops()) + // Pre-v6 does not support unaligned mem access. + return false; + else if (!Subtarget->hasV6Ops()) { + // v6 may or may not support unaligned mem access. + if (!Subtarget->isTargetDarwin()) + return false; + } + + switch (VT.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + return true; + // FIXME: VLD1 etc with standard alignment is legal. 
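
One detail worth flagging in allowsUnalignedMemoryAccesses above: the else-if repeats the !Subtarget->hasV6Ops() test, so that branch (and its Darwin restriction for v6) is unreachable as written, since the first test has already rejected every pre-v6 target. The type gate itself reduces to this standalone model:

    // Only small scalar integer types report unaligned support; FP and
    // vector types stay conservative (the FIXME notes VLD1 could be added).
    static bool unalignedOKForType(MVT::SimpleValueType T) {
      switch (T) {
      case MVT::i8:
      case MVT::i16:
      case MVT::i32:
        return true;
      default:
        return false;
      }
    }
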
+ } +} + +static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { + if (V < 0) + return false; + + unsigned Scale = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + // Scale == 1; + break; + case MVT::i16: + // Scale == 2; + Scale = 2; + break; + case MVT::i32: + // Scale == 4; + Scale = 4; + break; + } + + if ((V & (Scale - 1)) != 0) + return false; + V /= Scale; + return V == (V & ((1LL << 5) - 1)); +} + +static bool isLegalT2AddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + bool isNeg = false; + if (V < 0) { + isNeg = true; + V = - V; + } + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + // + imm12 or - imm8 + if (isNeg) + return V == (V & ((1LL << 8) - 1)); + return V == (V & ((1LL << 12) - 1)); + case MVT::f32: + case MVT::f64: + // Same as ARM mode. FIXME: NEON? + if (!Subtarget->hasVFP2()) + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + /// isLegalAddressImmediate - Return true if the integer value can be used /// as the offset of the target addressing mode for load / store of the /// given type. -static bool isLegalAddressImmediate(int64_t V, MVT VT, +static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { if (V == 0) return true; @@ -2813,36 +3434,15 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT, if (!VT.isSimple()) return false; - if (Subtarget->isThumb()) { - if (V < 0) - return false; - - unsigned Scale = 1; - switch (VT.getSimpleVT()) { - default: return false; - case MVT::i1: - case MVT::i8: - // Scale == 1; - break; - case MVT::i16: - // Scale == 2; - Scale = 2; - break; - case MVT::i32: - // Scale == 4; - Scale = 4; - break; - } - - if ((V & (Scale - 1)) != 0) - return false; - V /= Scale; - return V == (V & ((1LL << 5) - 1)); - } + if (Subtarget->isThumb1Only()) + return isLegalT1AddressImmediate(V, VT); + else if (Subtarget->isThumb2()) + return isLegalT2AddressImmediate(V, VT, Subtarget); + // ARM mode. if (V < 0) V = - V; - switch (VT.getSimpleVT()) { + switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: @@ -2854,7 +3454,7 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT, return V == (V & ((1LL << 8) - 1)); case MVT::f32: case MVT::f64: - if (!Subtarget->hasVFP2()) + if (!Subtarget->hasVFP2()) // FIXME: NEON? return false; if ((V & 3) != 0) return false; @@ -2863,11 +3463,44 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT, } } +bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, + EVT VT) const { + int Scale = AM.Scale; + if (Scale < 0) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + if (Scale == 1) + return true; + // r + r << imm + Scale = Scale & ~1; + return Scale == 2 || Scale == 4 || Scale == 8; + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. + + // Allow r << imm, but the imm has to be a multiple of two. 
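
The Thumb1 rule above packs offsets into an unsigned 5-bit field scaled by the access size, so byte loads reach offset 31, halfword loads 62, and word loads 124; the Thumb2 rule instead accepts +imm12 or -imm8 byte offsets. A runnable restatement of the Thumb1 check (helper name invented):

    #include <cassert>
    static bool legalT1AddrImm(long long V, unsigned Scale) {
      if (V < 0 || (V & (Scale - 1)) != 0)
        return false;                  // must be non-negative and aligned
      V /= Scale;
      return V == (V & 31);            // 5-bit scaled offset field
    }
    int main() {
      assert(legalT1AddrImm(124, 4));  // ldr  r0, [r1, #124]
      assert(!legalT1AddrImm(128, 4)); // out of range for Thumb1 ldr
      assert(!legalT1AddrImm(2, 4));   // misaligned word offset
      return 0;
    }

Note also that, together with the isPowerOf2_32 check that follows, the "multiple of two" comment above means the accepted shift scales are exactly the powers of two from 2 upward.
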
+ if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } +} + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const { - MVT VT = getValueType(Ty, true); + EVT VT = getValueType(Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; @@ -2879,7 +3512,7 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, case 0: // no scale reg, must be "r+i" or "r", or "i". break; case 1: - if (Subtarget->isThumb()) + if (Subtarget->isThumb1Only()) return false; // FALL THROUGH. default: @@ -2890,22 +3523,22 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, if (!VT.isSimple()) return false; + if (Subtarget->isThumb2()) + return isLegalT2ScaledAddressingMode(AM, VT); + int Scale = AM.Scale; - switch (VT.getSimpleVT()) { + switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i32: - case MVT::i64: - // This assumes i64 is legalized to a pair of i32. If not (i.e. - // ldrd / strd are used, then its address mode is same as i16. - // r + r if (Scale < 0) Scale = -Scale; if (Scale == 1) return true; // r + r << imm return isPowerOf2_32(Scale & ~1); case MVT::i16: + case MVT::i64: // r + r if (((unsigned)AM.HasBaseReg + Scale) <= 2) return true; @@ -2917,15 +3550,15 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, // operations. This should be made more precise and revisited later. // Allow r << imm, but the imm has to be a multiple of two. - if (AM.Scale & 1) return false; - return isPowerOf2_32(AM.Scale); + if (Scale & 1) return false; + return isPowerOf2_32(Scale); } break; } return true; } -static bool getARMIndexedAddressParts(SDNode *Ptr, MVT VT, +static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { @@ -2983,7 +3616,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, MVT VT, return false; } -static bool getT2IndexedAddressParts(SDNode *Ptr, MVT VT, +static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { @@ -3019,7 +3652,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (Subtarget->isThumb1Only()) return false; - MVT VT; + EVT VT; SDValue Ptr; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { @@ -3037,7 +3670,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset, isInc, DAG); - else + else isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) @@ -3058,7 +3691,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, if (Subtarget->isThumb1Only()) return false; - MVT VT; + EVT VT; SDValue Ptr; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { @@ -3074,7 +3707,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); - else + else isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) @@ -3128,12 +3761,12 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) 
const { std::pair<unsigned, const TargetRegisterClass*> ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { + EVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'l': - if (Subtarget->isThumb()) + if (Subtarget->isThumb1Only()) return std::make_pair(0U, ARM::tGPRRegisterClass); else return std::make_pair(0U, ARM::GPRRegisterClass); @@ -3152,7 +3785,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, std::vector<unsigned> ARMTargetLowering:: getRegClassForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { + EVT VT) const { if (Constraint.size() != 1) return std::vector<unsigned>(); @@ -3214,10 +3847,16 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, switch (Constraint) { case 'I': - if (Subtarget->isThumb()) { - // This must be a constant between 0 and 255, for ADD immediates. + if (Subtarget->isThumb1Only()) { + // This must be a constant between 0 and 255, for ADD + // immediates. if (CVal >= 0 && CVal <= 255) break; + } else if (Subtarget->isThumb2()) { + // A constant that can be used as an immediate value in a + // data-processing instruction. + if (ARM_AM::getT2SOImmVal(CVal) != -1) + break; } else { // A constant that can be used as an immediate value in a // data-processing instruction. @@ -3227,7 +3866,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'J': - if (Subtarget->isThumb()) { + if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is @@ -3244,13 +3883,21 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'K': - if (Subtarget->isThumb()) { + if (Subtarget->isThumb1Only()) { // A 32-bit value where only one byte has a nonzero value. Exclude // zero to match GCC. This constraint is used by GCC internally for // constants that can be loaded with a move/shift combination. // It is not useful otherwise but is implemented for compatibility. if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) break; + } else if (Subtarget->isThumb2()) { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getT2SOImmVal(~CVal) != -1) + break; } else { // A constant whose bitwise inverse can be used as an immediate // value in a data-processing instruction. This can be used in GCC @@ -3263,11 +3910,19 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'L': - if (Subtarget->isThumb()) { + if (Subtarget->isThumb1Only()) { // This must be a constant between -7 and 7, // for 3-operand ADD/SUB immediate instructions. if (CVal >= -7 && CVal < 7) break; + } else if (Subtarget->isThumb2()) { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. 
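
The getT2SOImmVal tests used in the Thumb2 branches of these constraint letters accept the "modified immediate" class: an 8-bit chunk either replicated across the word (0x00XY00XY, 0xXY00XY00, 0xXYXYXYXY) or, with its top bit set, rotated into position. A runnable example for the inverted ('K') case:

    #include <cassert>
    #include <cstdint>
    static uint32_t ror32(uint32_t v, unsigned n) {
      return (v >> n) | (v << (32u - n));
    }
    int main() {
      // 'K' accepts C when ~C encodes: for C = 0xFFFF00FF, the inverse
      // 0x0000FF00 is 0xFF rotated right by 24, a valid modified immediate.
      assert(~0xFFFF00FFu == ror32(0xFFu, 24));
      return 0;
    }
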
+ if (ARM_AM::getT2SOImmVal(-CVal) != -1) + break; } else { // A constant whose negation can be used as an immediate value in a // data-processing instruction. This can be used in GCC with an "n" @@ -3280,7 +3935,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'M': - if (Subtarget->isThumb()) { + if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) @@ -3295,7 +3950,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'N': - if (Subtarget->isThumb()) { + if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a constant between 0 and 31, for shift amounts. if (CVal >= 0 && CVal <= 31) break; @@ -3303,7 +3958,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'O': - if (Subtarget->isThumb()) { + if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a multiple of 4 between -508 and 508, for // ADD/SUB sp = sp + immediate. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) @@ -3322,3 +3977,9 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, Ops, DAG); } + +bool +ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The ARM target isn't yet aware of offsets. + return false; +} diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 553a86d..7d85f45 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -40,6 +40,7 @@ namespace llvm { tCALL, // Thumb function call. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. + BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). RET_FLAG, // Return with a flag operand. PIC_ADD, // Add with a PC operand and a PIC label. @@ -64,11 +65,13 @@ namespace llvm { FMRRD, // double to two gprs. FMDRR, // Two gprs to double. - EH_SJLJ_SETJMP, // SjLj exception handling setjmp - EH_SJLJ_LONGJMP, // SjLj exception handling longjmp + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. THREAD_POINTER, + DYN_ALLOC, // Dynamic allocation on the stack. + VCEQ, // Vector compare equal. VCGE, // Vector compare greater than or equal. VCGEU, // Vector compare unsigned greater than or equal. @@ -112,8 +115,18 @@ namespace llvm { VGETLANEu, // zero-extend vector extract element VGETLANEs, // sign-extend vector extract element - // Vector duplicate lane (128-bit result only; 64-bit is a shuffle) - VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector + // Vector duplicate: + VDUP, + VDUPLANE, + + // Vector shuffles: + VEXT, // extract + VREV64, // reverse elements within 64-bit doublewords + VREV32, // reverse elements within 32-bit words + VREV16, // reverse elements within 16-bit halfwords + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) + VTRN // transpose }; } @@ -147,11 +160,18 @@ namespace llvm { virtual const char *getTargetNodeName(unsigned Opcode) const; virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *MBB, + DenseMap<MachineBasicBlock*, MachineBasicBlock*>*) const; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. + /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON? 
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const; + bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address @@ -175,13 +195,15 @@ namespace llvm { APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const; + + ConstraintType getConstraintType(const std::string &Constraint) const; std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const; + EVT VT) const; std::vector<unsigned> getRegClassForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const; + EVT VT) const; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is @@ -200,21 +222,23 @@ namespace llvm { /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; + bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; private: /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. const ARMSubtarget *Subtarget; - /// ARMPCLabelIndex - Keep track the number of ARM PC labels created. + /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. /// unsigned ARMPCLabelIndex; - void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); + void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(EVT VT); + void addQRTypeForNEON(EVT VT); typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; - void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG, + void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, @@ -224,15 +248,13 @@ namespace llvm { SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, DebugLoc dl); - CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const; - SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, - const SDValue &StackPtr, const CCValAssign &VA, - SDValue Chain, SDValue Arg, ISD::ArgFlagsTy Flags); - SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, - unsigned CallingConv, SelectionDAG &DAG); - SDValue LowerCALL(SDValue Op, SelectionDAG &DAG); + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags); + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG); SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG); - SDValue LowerRET(SDValue Op, SelectionDAG &DAG); SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG); SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG); SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG); @@ -241,9 +263,9 @@ namespace llvm { 
SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG); SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG); - SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG); SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG); SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG); + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain, @@ -252,6 +274,33 @@ namespace llvm { bool AlwaysInline, const Value *DstSV, uint64_t DstSVOff, const Value *SrcSV, uint64_t SrcSVOff); + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals); + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals); + + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals); + + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + DebugLoc dl, SelectionDAG &DAG); }; } diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 301a6c1..3d19f23 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -54,9 +54,16 @@ def NEONGetLnFrm : Format<25>; def NEONSetLnFrm : Format<26>; def NEONDupFrm : Format<27>; -// Misc flag for data processing instructions that indicates whether +// Misc flags. + // the instruction has a Rn register operand. -class UnaryDP { bit isUnaryDataProc = 1; } +// UnaryDP - Indicates this is a unary data processing instruction, i.e. +// it doesn't have a Rn operand. +class UnaryDP { bit isUnaryDataProc = 1; } + +// Xform16Bit - Indicates this Thumb2 instruction may be transformed into +// a 16-bit Thumb instruction if certain conditions are met. +class Xform16Bit { bit canXformTo16Bit = 1; } //===----------------------------------------------------------------------===// // ARM Instruction flags. These need to match ARMInstrInfo.h. @@ -77,7 +84,7 @@ def AddrModeT1_1 : AddrMode<7>; def AddrModeT1_2 : AddrMode<8>; def AddrModeT1_4 : AddrMode<9>; def AddrModeT1_s : AddrMode<10>; -def AddrModeT2_i12: AddrMode<12>; +def AddrModeT2_i12: AddrMode<11>; def AddrModeT2_i8 : AddrMode<12>; def AddrModeT2_so : AddrMode<13>; def AddrModeT2_pc : AddrMode<14>; @@ -103,11 +110,33 @@ def IndexModePost : IndexMode<2>; //===----------------------------------------------------------------------===// +// ARM special operands. +// + +// ARM Predicate operand. Default to 14 = always (AL). Second part is CC +// register whose default is 0 (no register). +def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), + (ops (i32 14), (i32 zero_reg))> { + let PrintMethod = "printPredicateOperand"; +} + +// Conditional code result for instructions whose 's' bit is set, e.g. subs. +def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { + let PrintMethod = "printSBitModifierOperand"; +} + +// Same as cc_out except it defaults to setting CPSR. 
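Note the quiet bug fix in the AddrMode list above: AddrModeT2_i12 was declared as AddrMode<12>, colliding with AddrModeT2_i8. Per the comment in this file these numbers must match the C++ enum in ARMInstrInfo.h, so the collision made the two Thumb-2 load/store forms indistinguishable to any code switching on the mode. A trimmed, hypothetical stand-in for that enum showing the corrected numbering (the s_cc_out operand the comment above describes is defined just below):

// Stand-in for the addressing-mode enum; only the tail shown in this
// diff is reproduced, values exactly as the .td file assigns them.
enum AddrMode {
  AddrModeT1_s   = 10,
  AddrModeT2_i12 = 11,   // was 12, colliding with AddrModeT2_i8
  AddrModeT2_i8  = 12,
  AddrModeT2_so  = 13,
  AddrModeT2_pc  = 14,
  // ...
};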
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { + let PrintMethod = "printSBitModifierOperand"; +} + +//===----------------------------------------------------------------------===// + // ARM Instruction templates. // class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im, - Format f, string cstr> + Format f, string cstr, InstrItinClass itin> : Instruction { field bits<32> Inst; @@ -130,12 +159,15 @@ class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im, // Attributes specific to ARM instructions... // bit isUnaryDataProc = 0; + bit canXformTo16Bit = 0; let Constraints = cstr; + let Itinerary = itin; } -class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern> - : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, ""> { +class PseudoInst<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, "", itin> { let OutOperandList = oops; let InOperandList = iops; let AsmString = asm; @@ -144,9 +176,10 @@ class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern> // Almost all ARM instructions are predicable. class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, string opc, string asm, string cstr, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, im, f, cstr> { + : InstARM<am, sz, im, f, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ops pred:$p)); let AsmString = !strconcat(opc, !strconcat("${p}", asm)); @@ -158,9 +191,10 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, // an input operand since by default it's a zero register. It will // become an implicit def once it's "flipped". class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, string opc, string asm, string cstr, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, im, f, cstr> { + : InstARM<am, sz, im, f, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ops pred:$p, cc_out:$s)); let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm)); @@ -170,8 +204,9 @@ class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, // Special cases class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, im, f, cstr> { + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; let AsmString = asm; @@ -179,90 +214,93 @@ class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, list<Predicate> Predicates = [IsARM]; } -class AI<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern>; -class AsI<dag oops, dag iops, Format f, string opc, +class AI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AsI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AXI<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> - : sI<oops, iops, 
AddrModeNone, Size4Bytes, IndexModeNone, f, opc, + : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern>; -class AXI<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, asm, - "", pattern>; // Ctrl flow instructions -class ABI<bits<4> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, opc, - asm, "", pattern> { +class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin, + opc, asm, "", pattern> { let Inst{27-24} = opcod; } -class ABXI<bits<4> opcod, dag oops, dag iops, string asm, list<dag> pattern> - : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, asm, - "", pattern> { +class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin, + asm, "", pattern> { let Inst{27-24} = opcod; } -class ABXIx2<dag oops, dag iops, string asm, list<dag> pattern> - : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, asm, - "", pattern>; +class ABXIx2<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, itin, + asm, "", pattern>; // BR_JT instructions -class JTI<dag oops, dag iops, string asm, list<dag> pattern> - : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, +class JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin, asm, "", pattern>; // addrmode1 instructions -class AI1<bits<4> opcod, dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AsI1<bits<4> opcod, dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AXI1<bits<4> opcod, dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, asm, - "", pattern> { +class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AI1x2<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, opc, - asm, "", pattern>; +class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; // 
addrmode2 loads and stores -class AI2<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI2<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{27-26} = {0,1}; } // loads -class AI2ldw<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldw<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, +class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit @@ -270,19 +308,19 @@ class AXI2ldw<dag oops, dag iops, Format f, string asm, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AI2ldb<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldb<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, +class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit @@ -292,19 +330,19 @@ class AXI2ldb<dag oops, dag iops, Format f, string asm, } // stores -class AI2stw<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2stw<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, +class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit @@ -312,19 +350,19 @@ class AXI2stw<dag oops, dag iops, Format f, string asm, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AI2stb<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : 
I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2stb<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, +class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit @@ -334,20 +372,20 @@ class AXI2stb<dag oops, dag iops, Format f, string asm, } // Pre-indexed loads -class AI2ldwpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AI2ldbpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{22} = 1; // B bit @@ -356,20 +394,20 @@ class AI2ldbpr<dag oops, dag iops, Format f, string opc, } // Pre-indexed stores -class AI2stwpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{20} = 0; // L bit let Inst{21} = 1; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AI2stbpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{20} = 0; // L bit let Inst{21} = 1; // W bit let Inst{22} = 1; // B bit @@ -378,20 +416,20 @@ class AI2stbpr<dag oops, dag iops, Format f, string opc, } // Post-indexed loads -class AI2ldwpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + 
opc, asm, cstr,pattern> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 0; // P bit let Inst{27-26} = {0,1}; } -class AI2ldbpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit @@ -400,20 +438,20 @@ class AI2ldbpo<dag oops, dag iops, Format f, string opc, } // Post-indexed stores -class AI2stwpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 0; // P bit let Inst{27-26} = {0,1}; } -class AI2stbpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit @@ -422,20 +460,20 @@ class AI2stbpo<dag oops, dag iops, Format f, string opc, } // addrmode3 instructions -class AI3<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern>; -class AXI3<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, asm, - "", pattern>; +class AI3<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AXI3<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern>; // loads -class AI3ldh<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 0; // S bit @@ -443,10 +481,11 @@ class AI3ldh<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } -class AXI3ldh<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, +class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, asm, 
"", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit @@ -456,10 +495,10 @@ class AXI3ldh<dag oops, dag iops, Format f, string asm, let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit } -class AI3ldsh<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 1; // S bit @@ -467,10 +506,11 @@ class AI3ldsh<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } -class AXI3ldsh<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, +class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit @@ -480,10 +520,10 @@ class AXI3ldsh<dag oops, dag iops, Format f, string asm, let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit } -class AI3ldsb<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 0; // H bit let Inst{6} = 1; // S bit @@ -491,10 +531,11 @@ class AI3ldsb<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } -class AXI3ldsb<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, +class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 0; // H bit @@ -504,10 +545,10 @@ class AXI3ldsb<dag oops, dag iops, Format f, string asm, let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit } -class AI3ldd<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 0; // H bit let Inst{6} = 1; // S bit @@ -515,13 +556,14 @@ class AI3ldd<dag oops, dag iops, Format f, string opc, let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } // stores -class AI3sth<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 0; // S 
bit @@ -529,10 +571,11 @@ class AI3sth<dag oops, dag iops, Format f, string opc, let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } -class AXI3sth<dag oops, dag iops, Format f, string asm, - list<dag> pattern> - : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, +class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit @@ -542,10 +585,10 @@ class AXI3sth<dag oops, dag iops, Format f, string asm, let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit } -class AI3std<dag oops, dag iops, Format f, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc, - asm, "", pattern> { +class AI3std<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 1; // S bit @@ -553,13 +596,14 @@ class AI3std<dag oops, dag iops, Format f, string opc, let Inst{20} = 0; // L bit let Inst{21} = 0; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } // Pre-indexed loads -class AI3ldhpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 0; // S bit @@ -567,11 +611,12 @@ class AI3ldhpr<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } -class AI3ldshpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 1; // S bit @@ -579,11 +624,12 @@ class AI3ldshpr<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } -class AI3ldsbpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc, - asm, cstr, pattern> { +class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{4} = 1; let Inst{5} = 0; // H bit let Inst{6} = 1; // S bit @@ -591,13 +637,14 @@ class AI3ldsbpr<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } // Pre-indexed stores -class AI3sthpr<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc, - asm, cstr, 
pattern> { +class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 0; // S bit @@ -605,13 +652,14 @@ class AI3sthpr<dag oops, dag iops, Format f, string opc, let Inst{20} = 0; // L bit let Inst{21} = 1; // W bit let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; } // Post-indexed loads -class AI3ldhpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 0; // S bit @@ -619,11 +667,12 @@ class AI3ldhpo<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; } -class AI3ldshpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 1; // S bit @@ -631,11 +680,12 @@ class AI3ldshpo<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; } -class AI3ldsbpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{4} = 1; let Inst{5} = 0; // H bit let Inst{6} = 1; // S bit @@ -643,13 +693,14 @@ class AI3ldsbpo<dag oops, dag iops, Format f, string opc, let Inst{20} = 1; // L bit let Inst{21} = 1; // W bit let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; } // Post-indexed stores -class AI3sthpo<dag oops, dag iops, Format f, string opc, - string asm, string cstr, list<dag> pattern> - : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc, - asm, cstr,pattern> { +class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr,pattern> { let Inst{4} = 1; let Inst{5} = 1; // H bit let Inst{6} = 0; // S bit @@ -657,57 +708,60 @@ class AI3sthpo<dag oops, dag iops, Format f, string opc, let Inst{20} = 0; // L bit let Inst{21} = 1; // W bit let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; } // addrmode4 instructions -class AXI4ld<dag oops, dag iops, Format f, string asm, list<dag> pattern> - : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm, - "", pattern> { +class AXI4ld<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin, + 
asm, "", pattern> { let Inst{20} = 1; // L bit let Inst{22} = 0; // S bit let Inst{27-25} = 0b100; } -class AXI4st<dag oops, dag iops, Format f, string asm, list<dag> pattern> - : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm, - "", pattern> { +class AXI4st<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin, + asm, "", pattern> { let Inst{20} = 0; // L bit let Inst{22} = 0; // S bit let Inst{27-25} = 0b100; } // Unsigned multiply, multiply-accumulate instructions. -class AMul1I<bits<7> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc, - asm, "", pattern> { +class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { let Inst{7-4} = 0b1001; let Inst{20} = 0; // S bit let Inst{27-21} = opcod; } -class AsMul1I<bits<7> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc, - asm, "", pattern> { +class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { let Inst{7-4} = 0b1001; let Inst{27-21} = opcod; } // Most significant word multiply -class AMul2I<bits<7> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc, - asm, "", pattern> { +class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { let Inst{7-4} = 0b1001; let Inst{20} = 1; let Inst{27-21} = opcod; } // SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y> -class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc, - asm, "", pattern> { +class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { let Inst{4} = 0; let Inst{7} = 1; let Inst{20} = 0; @@ -715,19 +769,19 @@ class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc, } // Extend instructions. -class AExtI<bits<8> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, opc, - asm, "", pattern> { +class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin, + opc, asm, "", pattern> { let Inst{7-4} = 0b0111; let Inst{27-20} = opcod; } // Misc Arithmetic instructions. 
-class AMiscA1I<bits<8> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, opc, - asm, "", pattern> { +class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { let Inst{27-20} = opcod; } @@ -751,74 +805,120 @@ class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { // TI - Thumb instruction. -class ThumbI<dag outs, dag ins, AddrMode am, SizeFlagVal sz, - string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> { - let OutOperandList = outs; - let InOperandList = ins; +class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; let AsmString = asm; let Pattern = pattern; list<Predicate> Predicates = [IsThumb]; } -class TI<dag outs, dag ins, string asm, list<dag> pattern> - : ThumbI<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>; +class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; -// BL, BLX(1) are translated by assembler into two instructions -class TIx2<dag outs, dag ins, string asm, list<dag> pattern> - : ThumbI<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>; - -// BR_JT instructions -class TJTI<dag outs, dag ins, string asm, list<dag> pattern> - : ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>; +// Two-address instructions +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>; -// TPat - Same as Pat<>, but requires that the compiler be in Thumb mode. 
-class TPat<dag pattern, dag result> : Pat<pattern, result> { - list<Predicate> Predicates = [IsThumb]; -} +// tBL, tBX instructions +class TIx2<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>; -class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { - list<Predicate> Predicates = [IsThumb, HasV5T]; -} +// BR_JT instructions +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; // Thumb1 only -class Thumb1I<dag outs, dag ins, AddrMode am, SizeFlagVal sz, - string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> { - let OutOperandList = outs; - let InOperandList = ins; +class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; let AsmString = asm; let Pattern = pattern; list<Predicate> Predicates = [IsThumb1Only]; } -class T1I<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>; -class T1I1<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeT1_1, Size2Bytes, asm, "", pattern>; -class T1I2<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeT1_2, Size2Bytes, asm, "", pattern>; -class T1I4<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeT1_4, Size2Bytes, asm, "", pattern>; -class T1Is<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeT1_s, Size2Bytes, asm, "", pattern>; -class T1Ix2<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>; -class T1JTI<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>; +class T1I<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; +class T1Ix2<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>; +class T1JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; // Two-address instructions -class T1It<dag outs, dag ins, string asm, list<dag> pattern> - : Thumb1I<outs, ins, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>; +class T1It<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, + asm, "$lhs = $dst", pattern>; -class T1Pat<dag pattern, dag result> : Pat<pattern, result> { +// Thumb1 instruction that can either be predicated or set CPSR. 
+class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { + let OutOperandList = !con(oops, (ops s_cc_out:$s)); + let InOperandList = !con(iops, (ops pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb1Only]; +} + +class T1sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1sIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, + "$lhs = $dst", pattern>; + +// Thumb1 instruction that can be predicated. +class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ops pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; list<Predicate> Predicates = [IsThumb1Only]; } +class T1pI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1pIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, + "$lhs = $dst", pattern>; + +class T1pI1<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>; +class T1pI2<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>; +class T1pI4<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>; +class T1pIs<dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>; + // Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable. class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> { + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ops pred:$p)); let AsmString = !strconcat(opc, !strconcat("${p}", asm)); @@ -832,8 +932,9 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, // FIXME: This uses unified syntax so {s} comes before {p}. We should make it // more consistent. 
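The asm-string plumbing in these templates is easy to miss: Thumb1sI (and Thumb2sI, defined next) splice the optional S suffix before the predicate, "${s}${p}", per unified syntax, while the ARM-mode sI template uses "${p}${s}"; the FIXME above is about that inconsistency. In-tree this is TableGen !strconcat plus the printSBitModifierOperand and printPredicateOperand printers named in the operand definitions; a plain-C++ illustration of the resulting mnemonics:

#include <string>

// ualMnemonic("add", true, "eq") == "addseq" (unified syntax),
// versus "addeqs" under the pre-UAL "${p}${s}" ordering.
static std::string ualMnemonic(const std::string &Opc, bool SetsCPSR,
                               const std::string &Pred /* "" for AL */) {
  return Opc + (SetsCPSR ? "s" : "") + Pred;
}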
class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> { + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ops pred:$p, cc_out:$s)); let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm)); @@ -843,8 +944,9 @@ class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, // Special cases class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> - : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> { + : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; let AsmString = asm; @@ -852,31 +954,46 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, list<Predicate> Predicates = [IsThumb2]; } -class T2I<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, opc, asm, "", pattern>; -class T2Ii12<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, opc, asm, "", pattern>; -class T2Ii8<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, opc, asm, "", pattern>; -class T2Iso<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, opc, asm, "", pattern>; -class T2Ipc<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, opc, asm, "", pattern>; -class T2Ii8s4<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, opc, asm, "", pattern>; +class T2I<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ii12<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ii8<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Iso<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ipc<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>; +class T2Ii8s4<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "", pattern>; + +class T2sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; + +class T2XI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>; +class T2JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; -class T2sI<dag oops, dag iops, string opc, string asm, list<dag> pattern> - : Thumb2sI<oops, iops, 
AddrModeNone, Size4Bytes, opc, asm, "", pattern>; +class T2Ix2<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>; -class T2XI<dag oops, dag iops, string asm, list<dag> pattern> - : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, asm, "", pattern>; -class T2JTI<dag oops, dag iops, string asm, list<dag> pattern> - : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, asm, "", pattern>; // T2Iidxldst - Thumb2 indexed load / store instructions. class T2Iidxldst<dag oops, dag iops, AddrMode am, IndexMode im, + InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, Size4Bytes, im, ThumbFrm, cstr> { + : InstARM<am, Size4Bytes, im, ThumbFrm, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ops pred:$p)); let AsmString = !strconcat(opc, !strconcat("${p}", asm)); @@ -884,6 +1001,15 @@ class T2Iidxldst<dag oops, dag iops, AddrMode am, IndexMode im, list<Predicate> Predicates = [IsThumb2]; } +// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. +class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb1Only, HasV5T]; +} + +// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode. +class T1Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb1Only]; +} // T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode. class T2Pat<dag pattern, dag result> : Pat<pattern, result> { @@ -896,11 +1022,41 @@ class T2Pat<dag pattern, dag result> : Pat<pattern, result> { // ARM VFP Instruction templates. // +// Almost all VFP instructions are predicable. +class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ops pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; +} + +// Special cases +class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; +} + +class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, + opc, asm, "", pattern>; + // ARM VFP addrmode5 loads and stores class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, opc, asm, "", pattern> { + : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. 
let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -908,9 +1064,10 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, } class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> - : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, opc, asm, "", pattern> { + : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -918,27 +1075,28 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, } // Load / store multiple -class AXSI5<dag oops, dag iops, string asm, list<dag> pattern> - : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStMulFrm, asm, "", pattern> { +class AXDI5<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, + VFPLdStMulFrm, itin, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1011; } -class AXDI5<dag oops, dag iops, string asm, list<dag> pattern> - : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStMulFrm, asm, "", pattern> { +class AXSI5<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, + VFPLdStMulFrm, itin, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1010; } - // Double precision, unary class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops, - string opc, string asm, list<dag> pattern> - : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> { + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { let Inst{27-20} = opcod1; let Inst{19-16} = opcod2; let Inst{11-8} = 0b1011; @@ -946,17 +1104,17 @@ class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops, } // Double precision, binary -class ADbI<bits<8> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> { +class ADbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { let Inst{27-20} = opcod; let Inst{11-8} = 0b1011; } // Single precision, unary class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops, - string opc, string asm, list<dag> pattern> - : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> { + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { // Bits 22 (D bit) and 5 (M bit) will be changed during instruction encoding. 
let Inst{27-20} = opcod1; let Inst{19-16} = opcod2; @@ -964,48 +1122,74 @@ class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops, let Inst{7-4} = opcod3; } +// Single precision unary, if no NEON +// Same as ASuI except not available if NEON is enabled +class ASuIn<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : ASuI<opcod1, opcod2, opcod2, oops, iops, itin, opc, asm, pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + // Single precision, binary -class ASbI<bits<8> opcod, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> { +class ASbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { // Bit 22 (D bit) can be changed during instruction encoding. let Inst{27-20} = opcod; let Inst{11-8} = 0b1010; } +// Single precision binary, if no NEON +// Same as ASbI except not available if NEON is enabled +class ASbIn<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : ASbI<opcod, oops, iops, itin, opc, asm, pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + // VFP conversion instructions class AVConv1I<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, - dag oops, dag iops, string opc, string asm, list<dag> pattern> - : AI<oops, iops, VFPConv1Frm, opc, asm, pattern> { + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> { let Inst{27-20} = opcod1; let Inst{19-16} = opcod2; let Inst{11-8} = opcod3; let Inst{6} = 1; } +// VFP conversion instructions, if no NEON +class AVConv1In<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, oops, iops, itin, opc, asm, pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f, - string opc, string asm, list<dag> pattern> - : AI<oops, iops, f, opc, asm, pattern> { + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, f, itin, opc, asm, pattern> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{4} = 1; } -class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, opc, asm, pattern>; +class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; -class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, opc, asm, pattern>; +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; -class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, opc, asm, pattern>; +class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag 
iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>; -class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc, - string asm, list<dag> pattern> - : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, opc, asm, pattern>; +class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>; //===----------------------------------------------------------------------===// @@ -1013,9 +1197,9 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc, // ARM NEON Instruction templates. // -class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm, - string cstr, list<dag> pattern> - : InstARM<am, Size4Bytes, im, NEONFrm, cstr> { +class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, Size4Bytes, im, NEONFrm, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; let AsmString = asm; @@ -1023,20 +1207,33 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm, list<Predicate> Predicates = [HasNEON]; } -class NI<dag oops, dag iops, string asm, list<dag> pattern> - : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> { +class NI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, asm, "", pattern> { +} + +class NI4<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : NeonI<oops, iops, AddrMode4, IndexModeNone, itin, asm, "", pattern> { +} + +class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrMode6, IndexModeNone, itin, asm, cstr, pattern> { + let Inst{31-24} = 0b11110100; } -class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern> - : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> { +class NDataI<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, asm, cstr, pattern> { let Inst{31-25} = 0b1111001; } // NEON "one register and a modified immediate" format. class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, bit op5, bit op4, - dag oops, dag iops, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, asm, cstr, pattern> { + dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, itin, asm, cstr, pattern> { let Inst{23} = op23; let Inst{21-19} = op21_19; let Inst{11-8} = op11_8; @@ -1049,8 +1246,9 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, // NEON 2 vector register format. 
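The 2- and 3-register NEON data-processing classes below (N2V, N2VImm, N3V) all work the same way: each "let Inst{...}" assignment pins an opcode fragment to fixed bit positions on top of the common NDataI prefix (bits 31-25 = 0b1111001). As a rough illustration of how those field assignments compose into one 32-bit word, here is a hand-written C++ sketch following the three-register N3V layout defined a little further below; encodeN3V and its parameter names are illustrative only, not LLVM's code emitter (which derives the same layout from TableGen):

#include <cstdint>

// Illustrative only: OR each opcode fragment into the bit positions the
// NDataI and N3V classes assign with "let Inst{...}".
uint32_t encodeN3V(uint32_t op24, uint32_t op23, uint32_t op21_20,
                   uint32_t op11_8, uint32_t op6, uint32_t op4) {
  uint32_t Inst = 0x79u << 25;   // 0b1111001: NDataI, Inst{31-25}
  Inst |= op24    << 24;         // N3V: Inst{24}
  Inst |= op23    << 23;         // N3V: Inst{23}
  Inst |= op21_20 << 20;         // N3V: Inst{21-20}
  Inst |= op11_8  << 8;          // N3V: Inst{11-8}
  Inst |= op6     << 6;          // N3V: Inst{6}
  Inst |= op4     << 4;          // N3V: Inst{4}
  return Inst;
}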
class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, - dag oops, dag iops, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, asm, cstr, pattern> { + dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, itin, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1063,8 +1261,9 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // NEON 2 vector register with immediate. class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op6, bit op4, - dag oops, dag iops, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, asm, cstr, pattern> { + dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, itin, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-16} = op21_16; @@ -1076,8 +1275,9 @@ class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, // NEON 3 vector register format. class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, asm, cstr, pattern> { + dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, itin, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1088,9 +1288,9 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, // NEON VMOVs between scalar and core registers. class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, - dag oops, dag iops, Format f, string opc, string asm, - list<dag> pattern> - : AI<oops, iops, f, opc, asm, pattern> { + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AI<oops, iops, f, itin, opc, asm, pattern> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; @@ -1098,13 +1298,23 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, list<Predicate> Predicates = [HasNEON]; } class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, - dag oops, dag iops, string opc, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm, - pattern>; + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin, + opc, asm, pattern>; class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, - dag oops, dag iops, string opc, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm, - pattern>; + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin, + opc, asm, pattern>; class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, - dag oops, dag iops, string opc, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>; + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, + opc, asm, pattern>; + +// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON +// for single-precision FP. 
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasNEON,UseNEONForFP]; +} diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 443fdc7..4c92891 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -21,52 +21,15 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" using namespace llvm; -static cl::opt<bool> -EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, - cl::desc("Enable ARM 2-addr to 3-addr conv")); - -static inline -const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) { - return MIB.addImm((int64_t)ARMCC::AL).addReg(0); -} - -static inline -const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) { - return MIB.addReg(0); -} - -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) - : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)) { -} - ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(*this, STI) { + : RI(*this, STI), Subtarget(STI) { } -void ARMInstrInfo::reMaterialize(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, - const MachineInstr *Orig) const { - DebugLoc dl = Orig->getDebugLoc(); - if (Orig->getOpcode() == ARM::MOVi2pieces) { - RI.emitLoadConstPool(MBB, I, this, dl, - DestReg, - Orig->getOperand(1).getImm(), - (ARMCC::CondCodes)Orig->getOperand(2).getImm(), - Orig->getOperand(3).getReg()); - return; - } - - MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MI->getOperand(0).setReg(DestReg); - MBB.insert(I, MI); -} - -static unsigned getUnindexedOpcode(unsigned Opc) { +unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { switch (Opc) { default: break; case ARM::LDR_PRE: @@ -94,820 +57,45 @@ static unsigned getUnindexedOpcode(unsigned Opc) { case ARM::STRB_POST: return ARM::STRB; } - return 0; -} - -MachineInstr * -ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { - if (!EnableARM3Addr) - return NULL; - - MachineInstr *MI = MBBI; - MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TSFlags = MI->getDesc().TSFlags; - bool isPre = false; - switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { - default: return NULL; - case ARMII::IndexModePre: - isPre = true; - break; - case ARMII::IndexModePost: - break; - } - - // Try splitting an indexed load/store to an un-indexed one plus an add/sub - // operation. - unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); - if (MemOpc == 0) - return NULL; - - MachineInstr *UpdateMI = NULL; - MachineInstr *MemMI = NULL; - unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); - const TargetInstrDesc &TID = MI->getDesc(); - unsigned NumOps = TID.getNumOperands(); - bool isLoad = !TID.mayStore(); - const MachineOperand &WB = isLoad ? 
MI->getOperand(1) : MI->getOperand(0); - const MachineOperand &Base = MI->getOperand(2); - const MachineOperand &Offset = MI->getOperand(NumOps-3); - unsigned WBReg = WB.getReg(); - unsigned BaseReg = Base.getReg(); - unsigned OffReg = Offset.getReg(); - unsigned OffImm = MI->getOperand(NumOps-2).getImm(); - ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm(); - switch (AddrMode) { - default: - assert(false && "Unknown indexed op!"); - return NULL; - case ARMII::AddrMode2: { - bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; - unsigned Amt = ARM_AM::getAM2Offset(OffImm); - if (OffReg == 0) { - int SOImmVal = ARM_AM::getSOImmVal(Amt); - if (SOImmVal == -1) - // Can't encode it in a so_imm operand. This transformation will - // add more than 1 instruction. Abandon! - return NULL; - UpdateMI = BuildMI(MF, MI->getDebugLoc(), - get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) - .addReg(BaseReg).addImm(SOImmVal) - .addImm(Pred).addReg(0).addReg(0); - } else if (Amt != 0) { - ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm); - unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt); - UpdateMI = BuildMI(MF, MI->getDebugLoc(), - get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg) - .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc) - .addImm(Pred).addReg(0).addReg(0); - } else - UpdateMI = BuildMI(MF, MI->getDebugLoc(), - get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) - .addReg(BaseReg).addReg(OffReg) - .addImm(Pred).addReg(0).addReg(0); - break; - } - case ARMII::AddrMode3 : { - bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub; - unsigned Amt = ARM_AM::getAM3Offset(OffImm); - if (OffReg == 0) - // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand. - UpdateMI = BuildMI(MF, MI->getDebugLoc(), - get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) - .addReg(BaseReg).addImm(Amt) - .addImm(Pred).addReg(0).addReg(0); - else - UpdateMI = BuildMI(MF, MI->getDebugLoc(), - get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) - .addReg(BaseReg).addReg(OffReg) - .addImm(Pred).addReg(0).addReg(0); - break; - } - } - - std::vector<MachineInstr*> NewMIs; - if (isPre) { - if (isLoad) - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc), MI->getOperand(0).getReg()) - .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); - else - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc)).addReg(MI->getOperand(1).getReg()) - .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); - NewMIs.push_back(MemMI); - NewMIs.push_back(UpdateMI); - } else { - if (isLoad) - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc), MI->getOperand(0).getReg()) - .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); - else - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc)).addReg(MI->getOperand(1).getReg()) - .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); - if (WB.isDead()) - UpdateMI->getOperand(0).setIsDead(); - NewMIs.push_back(UpdateMI); - NewMIs.push_back(MemMI); - } - - // Transfer LiveVariables states, kill / dead info. - if (LV) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned Reg = MO.getReg(); - - LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); - if (MO.isDef()) { - MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI; - if (MO.isDead()) - LV->addVirtualRegisterDead(Reg, NewMI); - } - if (MO.isUse() && MO.isKill()) { - for (unsigned j = 0; j < 2; ++j) { - // Look at the two new MI's in reverse order. 
- MachineInstr *NewMI = NewMIs[j]; - if (!NewMI->readsRegister(Reg)) - continue; - LV->addVirtualRegisterKilled(Reg, NewMI); - if (VI.removeKill(MI)) - VI.Kills.push_back(NewMI); - break; - } - } - } - } - } - - MFI->insert(MBBI, NewMIs[1]); - MFI->insert(MBBI, NewMIs[0]); - return NewMIs[0]; -} - -// Branch analysis. -bool -ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } - if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc || LastOpc == ARM::t2Bcc) { - // Block ends with fall-through condbranch. - TBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(LastInst->getOperand(1)); - Cond.push_back(LastInst->getOperand(2)); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - - // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) - return true; - - // If the block ends with ARM::B/ARM::tB/ARM::t2B and a - // ARM::Bcc/ARM::tBcc/ARM::t2Bcc, handle it. - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) || - (SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB) || - (SecondLastOpc == ARM::t2Bcc && LastOpc == ARM::t2B)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - Cond.push_back(SecondLastInst->getOperand(1)); - Cond.push_back(SecondLastInst->getOperand(2)); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } - - // If the block ends with two unconditional branches, handle it. The second - // one is not executed, so remove it. - if ((SecondLastOpc == ARM::B || SecondLastOpc==ARM::tB || - SecondLastOpc==ARM::t2B) && - (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } - - // ...likewise if it ends with a branch table followed by an unconditional - // branch. The branch folder can create these, and we must get rid of them for - // correctness of Thumb constant islands. - if ((SecondLastOpc == ARM::BR_JTr || SecondLastOpc==ARM::BR_JTm || - SecondLastOpc == ARM::BR_JTadd || SecondLastOpc==ARM::tBR_JTr || - SecondLastOpc == ARM::t2BR_JTr || SecondLastOpc==ARM::t2BR_JTm || - SecondLastOpc == ARM::t2BR_JTadd) && - (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B)) { - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return true; - } - - // Otherwise, can't handle this. - return true; -} - - -unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - int BOpc = AFI->isThumbFunction() ? - (AFI->isThumb2Function() ? 
ARM::t2B : ARM::tB) : ARM::B; - int BccOpc = AFI->isThumbFunction() ? - (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; - - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) return 0; - --I; - if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) return 1; - --I; - if (I->getOpcode() != BccOpc) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; -} - -unsigned -ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl = DebugLoc::getUnknownLoc(); - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - int BOpc = AFI->isThumbFunction() ? - (AFI->isThumb2Function() ? ARM::t2B : ARM::tB) : ARM::B; - int BccOpc = AFI->isThumbFunction() ? - (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; - - // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - assert((Cond.size() == 2 || Cond.size() == 0) && - "ARM branch conditions have two components!"); - if (FBB == 0) { - if (Cond.empty()) // Unconditional branch? - BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB); - else - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) - .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); - return 1; - } - - // Two-way conditional branch. - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) - .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); - BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB); - return 2; + return 0; } -bool -ARMBaseInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const { +bool ARMInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const { if (MBB.empty()) return false; switch (MBB.back().getOpcode()) { case ARM::BX_RET: // Return. case ARM::LDM_RET: - case ARM::tBX_RET: - case ARM::tBX_RET_vararg: - case ARM::tPOP_RET: case ARM::B: - case ARM::tB: - case ARM::t2B: // Uncond branch. - case ARM::tBR_JTr: - case ARM::t2BR_JTr: case ARM::BR_JTr: // Jumptable branch. - case ARM::t2BR_JTm: case ARM::BR_JTm: // Jumptable branch through mem. - case ARM::t2BR_JTadd: case ARM::BR_JTadd: // Jumptable branch add to pc. return true; - default: return false; - } -} - -bool ARMBaseInstrInfo:: -ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { - ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); - Cond[0].setImm(ARMCC::getOppositeCondition(CC)); - return false; -} - -bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { - int PIdx = MI->findFirstPredOperandIdx(); - return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; -} - -bool ARMBaseInstrInfo:: -PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const { - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B) { - MI->setDesc(get((Opc == ARM::B) ? ARM::Bcc : - ((Opc == ARM::tB) ? 
ARM::tBcc : ARM::t2Bcc))); - MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm())); - MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false)); - return true; - } - - int PIdx = MI->findFirstPredOperandIdx(); - if (PIdx != -1) { - MachineOperand &PMO = MI->getOperand(PIdx); - PMO.setImm(Pred[0].getImm()); - MI->getOperand(PIdx+1).setReg(Pred[1].getReg()); - return true; - } - return false; -} - -bool ARMBaseInstrInfo:: -SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const { - if (Pred1.size() > 2 || Pred2.size() > 2) - return false; - - ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm(); - ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm(); - if (CC1 == CC2) - return true; - - switch (CC1) { - default: - return false; - case ARMCC::AL: - return true; - case ARMCC::HS: - return CC2 == ARMCC::HI; - case ARMCC::LS: - return CC2 == ARMCC::LO || CC2 == ARMCC::EQ; - case ARMCC::GE: - return CC2 == ARMCC::GT; - case ARMCC::LE: - return CC2 == ARMCC::LT; - } -} - -bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - const TargetInstrDesc &TID = MI->getDesc(); - if (!TID.getImplicitDefs() && !TID.hasOptionalDef()) - return false; - - bool Found = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() == ARM::CPSR) { - Pred.push_back(MO); - Found = true; - } - } - - return Found; -} - - -/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing -static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, - unsigned JTI) DISABLE_INLINE; -static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, - unsigned JTI) { - return JT[JTI].MBBs.size(); -} - -/// GetInstSize - Return the size of the specified MachineInstr. -/// -unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); - const MachineFunction *MF = MBB.getParent(); - const TargetAsmInfo *TAI = MF->getTarget().getTargetAsmInfo(); - - // Basic size info comes from the TSFlags field. - const TargetInstrDesc &TID = MI->getDesc(); - unsigned TSFlags = TID.TSFlags; - - switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { - default: { - // If this machine instr is an inline asm, measure it. - if (MI->getOpcode() == ARM::INLINEASM) - return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName()); - if (MI->isLabel()) - return 0; - switch (MI->getOpcode()) { - default: - assert(0 && "Unknown or unset size field for instr!"); - break; - case TargetInstrInfo::IMPLICIT_DEF: - case TargetInstrInfo::DECLARE: - case TargetInstrInfo::DBG_LABEL: - case TargetInstrInfo::EH_LABEL: - return 0; - } - break; - } - case ARMII::Size8Bytes: return 8; // Arm instruction x 2. - case ARMII::Size4Bytes: return 4; // Arm instruction. - case ARMII::Size2Bytes: return 2; // Thumb instruction. - case ARMII::SizeSpecial: { - switch (MI->getOpcode()) { - case ARM::CONSTPOOL_ENTRY: - // If this machine instr is a constant pool entry, its size is recorded as - // operand #2. - return MI->getOperand(2).getImm(); - case ARM::Int_eh_sjlj_setjmp: return 12; - case ARM::BR_JTr: - case ARM::BR_JTm: - case ARM::BR_JTadd: - case ARM::t2BR_JTr: - case ARM::t2BR_JTm: - case ARM::t2BR_JTadd: - case ARM::tBR_JTr: { - // These are jumptable branches, i.e. a branch followed by an inlined - // jumptable. 
The size is 4 + 4 * number of entries. - unsigned NumOps = TID.getNumOperands(); - MachineOperand JTOP = - MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2)); - unsigned JTI = JTOP.getIndex(); - const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); - assert(JTI < JT.size()); - // Thumb instructions are 2 byte aligned, but JT entries are 4 byte - // 4 aligned. The assembler / linker may add 2 byte padding just before - // the JT entries. The size does not include this padding; the - // constant islands pass does separate bookkeeping for it. - // FIXME: If we know the size of the function is less than (1 << 16) *2 - // bytes, we can use 16-bit entries instead. Then there won't be an - // alignment issue. - return getNumJTEntries(JT, JTI) * 4 + - ((MI->getOpcode()==ARM::tBR_JTr) ? 2 : 4); - } - default: - // Otherwise, pseudo-instruction sizes are zero. - return 0; - } - } - } - return 0; // Not reached -} - -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. -/// -bool -ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned& SrcSubIdx, unsigned& DstSubIdx) const { - SrcSubIdx = DstSubIdx = 0; // No sub-registers. - - unsigned oc = MI.getOpcode(); - switch (oc) { default: - return false; - case ARM::FCPYS: - case ARM::FCPYD: - case ARM::VMOVD: - case ARM::VMOVQ: - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - return true; - case ARM::MOVr: - assert(MI.getDesc().getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "Invalid ARM MOV instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - return true; - } -} - -unsigned -ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case ARM::LDR: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isReg() && - MI->getOperand(3).isImm() && - MI->getOperand(2).getReg() == 0 && - MI->getOperand(3).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - case ARM::FLDD: - case ARM::FLDS: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; -} - -unsigned -ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case ARM::STR: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isReg() && - MI->getOperand(3).isImm() && - MI->getOperand(2).getReg() == 0 && - MI->getOperand(3).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - case ARM::FSTD: - case ARM::FSTS: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } break; } - return 0; -} - -bool -ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC) const { - DebugLoc DL = DebugLoc::getUnknownLoc(); - if (I != MBB.end()) DL = I->getDebugLoc(); - - if (DestRC != SrcRC) { - // Not yet 
supported! - return false; - } - - if (DestRC == ARM::GPRRegisterClass) - AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) - .addReg(SrcReg))); - else if (DestRC == ARM::SPRRegisterClass) - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg) - .addReg(SrcReg)); - else if (DestRC == ARM::DPRRegisterClass) - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg) - .addReg(SrcReg)); - else if (DestRC == ARM::QPRRegisterClass) - BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg); - else - return false; - - return true; -} - -void ARMBaseInstrInfo:: -storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC) const { - DebugLoc DL = DebugLoc::getUnknownLoc(); - if (I != MBB.end()) DL = I->getDebugLoc(); - - if (RC == ARM::GPRRegisterClass) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addReg(0).addImm(0)); - } else if (RC == ARM::DPRRegisterClass) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0)); - } else { - assert(RC == ARM::SPRRegisterClass && "Unknown regclass!"); - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0)); - } -} - -void -ARMBaseInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, - bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const{ - DebugLoc DL = DebugLoc::getUnknownLoc(); - unsigned Opc = 0; - if (RC == ARM::GPRRegisterClass) { - Opc = ARM::STR; - } else if (RC == ARM::DPRRegisterClass) { - Opc = ARM::FSTD; - } else { - assert(RC == ARM::SPRRegisterClass && "Unknown regclass!"); - Opc = ARM::FSTS; - } - - MachineInstrBuilder MIB = - BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); - AddDefaultPred(MIB); - NewMIs.push_back(MIB); - return; -} - -void ARMBaseInstrInfo:: -loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, - const TargetRegisterClass *RC) const { - DebugLoc DL = DebugLoc::getUnknownLoc(); - if (I != MBB.end()) DL = I->getDebugLoc(); - - if (RC == ARM::GPRRegisterClass) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) - .addFrameIndex(FI).addReg(0).addImm(0)); - } else if (RC == ARM::DPRRegisterClass) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg) - .addFrameIndex(FI).addImm(0)); - } else { - assert(RC == ARM::SPRRegisterClass && "Unknown regclass!"); - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg) - .addFrameIndex(FI).addImm(0)); - } -} - -void ARMBaseInstrInfo:: -loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - DebugLoc DL = DebugLoc::getUnknownLoc(); - unsigned Opc = 0; - if (RC == ARM::GPRRegisterClass) { - Opc = ARM::LDR; - } else if (RC == ARM::DPRRegisterClass) { - Opc = ARM::FLDD; - } else { - assert(RC == ARM::SPRRegisterClass && "Unknown regclass!"); - Opc = ARM::FLDS; - } - - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); - AddDefaultPred(MIB); - NewMIs.push_back(MIB); - return; + return 
false; } -MachineInstr *ARMBaseInstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - switch (Opc) { - default: break; - case ARM::MOVr: { - if (MI->getOperand(4).getReg() == ARM::CPSR) - // If it is updating CPSR, then it cannot be folded. - break; - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - } - break; - } - case ARM::FCPYS: { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) - .addFrameIndex(FI) - .addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - break; - } - case ARM::FCPYD: { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - break; - } +void ARMInstrInfo:: +reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig) const { + DebugLoc dl = Orig->getDebugLoc(); + if (Orig->getOpcode() == ARM::MOVi2pieces) { + RI.emitLoadConstPool(MBB, I, dl, + DestReg, SubIdx, + Orig->getOperand(1).getImm(), + (ARMCC::CondCodes)Orig->getOperand(2).getImm(), + Orig->getOperand(3).getReg()); + return; } - return NewMI; -} - -MachineInstr* 
-ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MI->getOperand(0).setReg(DestReg); + MBB.insert(I, MI); } -bool -ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: break; - case ARM::MOVr: - // If it is updating CPSR, then it cannot be folded. - return MI->getOperand(4).getReg() != ARM::CPSR; - case ARM::FCPYS: - case ARM::FCPYD: - return true; - - case ARM::VMOVD: - case ARM::VMOVQ: - return false; // FIXME - } - - return false; -} diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 8c8f788..c616949 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -15,247 +15,27 @@ #define ARMINSTRUCTIONINFO_H #include "llvm/Target/TargetInstrInfo.h" +#include "ARMBaseInstrInfo.h" #include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" #include "ARM.h" namespace llvm { class ARMSubtarget; -/// ARMII - This namespace holds all of the target specific flags that -/// instruction info tracks. -/// -namespace ARMII { - enum { - //===------------------------------------------------------------------===// - // Instruction Flags. - - //===------------------------------------------------------------------===// - // This four-bit field describes the addressing mode used. - - AddrModeMask = 0xf, - AddrModeNone = 0, - AddrMode1 = 1, - AddrMode2 = 2, - AddrMode3 = 3, - AddrMode4 = 4, - AddrMode5 = 5, - AddrMode6 = 6, - AddrModeT1_1 = 7, - AddrModeT1_2 = 8, - AddrModeT1_4 = 9, - AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data - AddrModeT2_i12 = 11, - AddrModeT2_i8 = 12, - AddrModeT2_so = 13, - AddrModeT2_pc = 14, // +/- i12 for pc relative data - AddrModeT2_i8s4 = 15, // i8 * 4 - - // Size* - Flags to keep track of the size of an instruction. - SizeShift = 4, - SizeMask = 7 << SizeShift, - SizeSpecial = 1, // 0 byte pseudo or special case. - Size8Bytes = 2, - Size4Bytes = 3, - Size2Bytes = 4, - - // IndexMode - Unindex, pre-indexed, or post-indexed. Only valid for load - // and store ops - IndexModeShift = 7, - IndexModeMask = 3 << IndexModeShift, - IndexModePre = 1, - IndexModePost = 2, - - //===------------------------------------------------------------------===// - // Misc flags. - - // UnaryDP - Indicates this is a unary data processing instruction, i.e. - // it doesn't have a Rn operand. - UnaryDP = 1 << 9, - - //===------------------------------------------------------------------===// - // Instruction encoding formats. 
- // - FormShift = 10, - FormMask = 0x1f << FormShift, - - // Pseudo instructions - Pseudo = 0 << FormShift, - - // Multiply instructions - MulFrm = 1 << FormShift, - - // Branch instructions - BrFrm = 2 << FormShift, - BrMiscFrm = 3 << FormShift, - - // Data Processing instructions - DPFrm = 4 << FormShift, - DPSoRegFrm = 5 << FormShift, - - // Load and Store - LdFrm = 6 << FormShift, - StFrm = 7 << FormShift, - LdMiscFrm = 8 << FormShift, - StMiscFrm = 9 << FormShift, - LdStMulFrm = 10 << FormShift, - - // Miscellaneous arithmetic instructions - ArithMiscFrm = 11 << FormShift, - - // Extend instructions - ExtFrm = 12 << FormShift, - - // VFP formats - VFPUnaryFrm = 13 << FormShift, - VFPBinaryFrm = 14 << FormShift, - VFPConv1Frm = 15 << FormShift, - VFPConv2Frm = 16 << FormShift, - VFPConv3Frm = 17 << FormShift, - VFPConv4Frm = 18 << FormShift, - VFPConv5Frm = 19 << FormShift, - VFPLdStFrm = 20 << FormShift, - VFPLdStMulFrm = 21 << FormShift, - VFPMiscFrm = 22 << FormShift, - - // Thumb format - ThumbFrm = 23 << FormShift, - - // NEON format - NEONFrm = 24 << FormShift, - NEONGetLnFrm = 25 << FormShift, - NEONSetLnFrm = 26 << FormShift, - NEONDupFrm = 27 << FormShift, - - //===------------------------------------------------------------------===// - // Field shifts - such shifts are used to set field while generating - // machine instructions. - M_BitShift = 5, - ShiftImmShift = 5, - ShiftShift = 7, - N_BitShift = 7, - ImmHiShift = 8, - SoRotImmShift = 8, - RegRsShift = 8, - ExtRotImmShift = 10, - RegRdLoShift = 12, - RegRdShift = 12, - RegRdHiShift = 16, - RegRnShift = 16, - S_BitShift = 20, - W_BitShift = 21, - AM3_I_BitShift = 22, - D_BitShift = 22, - U_BitShift = 23, - P_BitShift = 24, - I_BitShift = 25, - CondShift = 28 - }; -} - -class ARMBaseInstrInfo : public TargetInstrInfoImpl { -protected: - // Can be only subclassed. - explicit ARMBaseInstrInfo(const ARMSubtarget &STI); -public: - virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const; - - virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0; - - // Branch analysis. - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const; - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; - - virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const; - virtual - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; - - // Predication support. - virtual bool isPredicated(const MachineInstr *MI) const; - - ARMCC::CondCodes getPredicate(const MachineInstr *MI) const { - int PIdx = MI->findFirstPredOperandIdx(); - return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm() - : ARMCC::AL; - } - - virtual - bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const; - - virtual - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; - - virtual bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; - - /// GetInstSize - Returns the size of the specified MachineInstr. 
-  ///
-  virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
-
-  /// Return true if the instruction is a register to register move and return
-  /// the source and dest operands and their sub-register indices by reference.
-  virtual bool isMoveInstr(const MachineInstr &MI,
-                           unsigned &SrcReg, unsigned &DstReg,
-                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
-
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
-
-  virtual bool copyRegToReg(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator I,
-                            unsigned DestReg, unsigned SrcReg,
-                            const TargetRegisterClass *DestRC,
-                            const TargetRegisterClass *SrcRC) const;
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC) const;
-
-  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
-                              SmallVectorImpl<MachineOperand> &Addr,
-                              const TargetRegisterClass *RC,
-                              SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC) const;
-
-  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
-                               SmallVectorImpl<MachineOperand> &Addr,
-                               const TargetRegisterClass *RC,
-                               SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
-  virtual bool canFoldMemoryOperand(const MachineInstr *MI,
-                                    const SmallVectorImpl<unsigned> &Ops) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                              const SmallVectorImpl<unsigned> &Ops,
-                                              int FrameIndex) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                              const SmallVectorImpl<unsigned> &Ops,
-                                              MachineInstr* LoadMI) const;
-};
-
 class ARMInstrInfo : public ARMBaseInstrInfo {
   ARMRegisterInfo RI;
+  const ARMSubtarget &Subtarget;
 public:
   explicit ARMInstrInfo(const ARMSubtarget &STI);
 
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is no such opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  // Return true if the block does not fall through.
+  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
@@ -263,7 +43,8 @@ public: const ARMRegisterInfo &getRegisterInfo() const { return RI; } void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, const MachineInstr *Orig) const; + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig) const; }; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 408f47a..8adfac3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -34,6 +34,10 @@ def SDT_ARMBrJT : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; +def SDT_ARMBr2JT : SDTypeProfile<0, 4, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, @@ -71,6 +75,8 @@ def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, [SDNPHasChain]>; +def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, + [SDNPHasChain]>; def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, [SDNPOutFlag]>; @@ -93,10 +99,14 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>; def HasV5T : Predicate<"Subtarget->hasV5TOps()">; def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; def HasV6 : Predicate<"Subtarget->hasV6Ops()">; +def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">; +def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; def HasV7 : Predicate<"Subtarget->hasV7Ops()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">; def HasVFP3 : Predicate<"Subtarget->hasVFP3()">; def HasNEON : Predicate<"Subtarget->hasNEON()">; +def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; +def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; def IsThumb : Predicate<"Subtarget->isThumb()">; def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; def IsThumb2 : Predicate<"Subtarget->isThumb2()">; @@ -117,25 +127,16 @@ class RegConstraint<string C> { // ARM specific transformation functions and pattern fragments. // -// so_imm_XFORM - Return a so_imm value packed into the format described for -// so_imm def below. -def so_imm_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(N->getZExtValue()), - MVT::i32); -}]>; - // so_imm_neg_XFORM - Return a so_imm value packed into the format described for // so_imm_neg def below. def so_imm_neg_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(-(int)N->getZExtValue()), - MVT::i32); + return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); }]>; // so_imm_not_XFORM - Return a so_imm value packed into the format described for // so_imm_not def below. def so_imm_not_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(~(int)N->getZExtValue()), - MVT::i32); + return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32); }]>; // rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24. 
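One consequence of the so_imm changes in the hunk above: the XFORMs now hand the selector the plain immediate value and leave the actual operand encoding to later stages, while the so_imm operand itself (in the hunks that follow) still only matches immediates that ARM_AM::getSOImmVal can encode. The underlying rule is the standard ARM data-processing one: the value must be an 8-bit constant rotated right by an even amount. A minimal sketch of that test, assuming a hypothetical helper name (this is not LLVM's implementation):

#include <cstdint>

// isSOImmEncodable is a hand-written illustration of the check behind
// ARM_AM::getSOImmVal(...) != -1: a "shifter operand" immediate must be
// some 8-bit value rotated right by an even amount (0, 2, ..., 30).
bool isSOImmEncodable(uint32_t V) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    // Rotate V left by Rot, undoing a rotate-right by Rot.
    uint32_t Undone = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
    if (Undone <= 0xFF)
      return true;  // V == rotr(Undone, Rot) with Undone in 8 bits
  }
  return false;
}

For example, 0xFF000000 is encodable (0xFF rotated right by 8), but 0x1FE is not, since it would require an odd rotation.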
@@ -169,6 +170,48 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{
   return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
 }]>;
 
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{
+  uint32_t v = (uint32_t)N->getZExtValue();
+  if (v == 0xffffffff)
+    return 0;
+  // there can be 1's on either or both "outsides", all the "inside"
+  // bits must be 0's
+  unsigned int lsb = 0, msb = 31;
+  while (v & (1 << msb)) --msb;
+  while (v & (1 << lsb)) ++lsb;
+  for (unsigned int i = lsb; i <= msb; ++i) {
+    if (v & (1 << i))
+      return 0;
+  }
+  return 1;
+}] > {
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+}
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def lo16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff,
+                                   MVT::i32);
+}]>;
+
+def hi16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+  // Returns true if all low 16-bits are 0.
+  return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+  }], hi16>;
+
+/// imm0_65535 predicate - True if the 32-bit immediate is in the range
+/// [0,65535].
+def imm0_65535 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getZExtValue() < 65536;
+}]>;
+
 class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
 class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
 
@@ -192,6 +235,9 @@ def cpinst_operand : Operand<i32> {
 def jtblock_operand : Operand<i32> {
   let PrintMethod = "printJTBlockOperand";
 }
+def jt2block_operand : Operand<i32> {
+  let PrintMethod = "printJT2BlockOperand";
+}
 
 // Local PC labels.
 def pclabel : Operand<i32> {
@@ -212,9 +258,9 @@ def so_reg : Operand<i32>,    // reg reg imm
 // into so_imm instructions: the 8-bit immediate is the least significant bits
 // [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11].
 def so_imm : Operand<i32>,
-             PatLeaf<(imm),
-                     [{ return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; }],
-                     so_imm_XFORM> {
+             PatLeaf<(imm), [{
+      return ARM_AM::getSOImmVal(N->getZExtValue()) != -1;
+    }]> {
   let PrintMethod = "printSOImmOperand";
 }
 
@@ -230,14 +276,18 @@ def so_imm2part : Operand<i32>,
 def so_imm2part_1 : SDNodeXForm<imm, [{
   unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue());
-  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+  return CurDAG->getTargetConstant(V, MVT::i32);
 }]>;
 
 def so_imm2part_2 : SDNodeXForm<imm, [{
   unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue());
-  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+  return CurDAG->getTargetConstant(V, MVT::i32);
 }]>;
 
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
+  return (int32_t)N->getZExtValue() < 32;
+}]>;
 
 // Define ARM specific addressing modes.
 
@@ -274,7 +324,7 @@ def am3offset : Operand<i32>,
 // addrmode4 := reg, <mode|W>
 //
 def addrmode4 : Operand<i32>,
-                ComplexPattern<i32, 2, "", []> {
+                ComplexPattern<i32, 2, "SelectAddrMode4", []> {
   let PrintMethod = "printAddrMode4Operand";
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
@@ -303,17 +353,8 @@ def addrmodepc : Operand<i32>,
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
-// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
-// register whose default is 0 (no register).
-def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), - (ops (i32 14), (i32 zero_reg))> { - let PrintMethod = "printPredicateOperand"; -} - -// Conditional code result for instructions whose 's' bit is set, e.g. subs. -// -def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { - let PrintMethod = "printSBitModifierOperand"; +def nohash_imm : Operand<i32> { + let PrintMethod = "printNoHashImmediate"; } //===----------------------------------------------------------------------===// @@ -329,34 +370,44 @@ include "ARMInstrFormats.td" multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - opc, " $dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>; + IIC_iALUi, opc, " $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { + let Inst{25} = 1; + } def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, - opc, " $dst, $a, $b", + IIC_iALUr, opc, " $dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { + let Inst{25} = 0; let isCommutable = Commutable; } def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - opc, " $dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>; + IIC_iALUsr, opc, " $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> { + let Inst{25} = 0; + } } /// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the -/// instruction modifies the CSPR register. +/// instruction modifies the CPSR register. let Defs = [CPSR] in { multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - opc, "s $dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>; + IIC_iALUi, opc, "s $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { + let Inst{25} = 1; + } def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, - opc, "s $dst, $a, $b", + IIC_iALUr, opc, "s $dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { let isCommutable = Commutable; + let Inst{25} = 0; } def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - opc, "s $dst, $a, $b", - [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>; + IIC_iALUsr, opc, "s $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> { + let Inst{25} = 0; + } } } @@ -366,17 +417,25 @@ multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Defs = [CPSR] in { multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { - def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, + def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi, opc, " $a, $b", - [(opnode GPR:$a, so_imm:$b)]>; - def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, + [(opnode GPR:$a, so_imm:$b)]> { + let Inst{20} = 1; + let Inst{25} = 1; + } + def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr, opc, " $a, $b", [(opnode GPR:$a, GPR:$b)]> { + let Inst{20} = 1; + let Inst{25} = 0; let isCommutable = Commutable; } - def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, + def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr, opc, " $a, $b", - [(opnode GPR:$a, so_reg:$b)]>; + [(opnode GPR:$a, so_reg:$b)]> { + let Inst{20} = 1; + let Inst{25} = 0; + } } } @@ -384,15 +443,15 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode, /// register and one whose operand is a register rotated by 8/16/24. 
/// FIXME: Remove the 'r' variant. Its rot_imm is zero. multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> { - def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src), - opc, " $dst, $Src", - [(set GPR:$dst, (opnode GPR:$Src))]>, + def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src), + IIC_iUNAr, opc, " $dst, $src", + [(set GPR:$dst, (opnode GPR:$src))]>, Requires<[IsARM, HasV6]> { let Inst{19-16} = 0b1111; } - def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src, i32imm:$rot), - opc, " $dst, $Src, ror $rot", - [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>, + def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot), + IIC_iUNAsi, opc, " $dst, $src, ror $rot", + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, Requires<[IsARM, HasV6]> { let Inst{19-16} = 0b1111; } @@ -402,11 +461,11 @@ multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> { /// register and one whose operand is a register rotated by 8/16/24. multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> { def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), - opc, " $dst, $LHS, $RHS", + IIC_iALUr, opc, " $dst, $LHS, $RHS", [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>, Requires<[IsARM, HasV6]>; def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), - opc, " $dst, $LHS, $RHS, ror $rot", + IIC_iALUsi, opc, " $dst, $LHS, $RHS, ror $rot", [(set GPR:$dst, (opnode GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)))]>, Requires<[IsARM, HasV6]>; @@ -417,37 +476,45 @@ let Uses = [CPSR] in { multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, opc, " $dst, $a, $b", + DPFrm, IIC_iALUi, opc, " $dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, - Requires<[IsARM, CarryDefIsUnused]>; + Requires<[IsARM, CarryDefIsUnused]> { + let Inst{25} = 1; + } def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - DPFrm, opc, " $dst, $a, $b", + DPFrm, IIC_iALUr, opc, " $dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, Requires<[IsARM, CarryDefIsUnused]> { let isCommutable = Commutable; + let Inst{25} = 0; } def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, opc, " $dst, $a, $b", + DPSoRegFrm, IIC_iALUsr, opc, " $dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, - Requires<[IsARM, CarryDefIsUnused]>; + Requires<[IsARM, CarryDefIsUnused]> { + let Inst{25} = 0; + } // Carry setting variants def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, !strconcat(opc, "s $dst, $a, $b"), + DPFrm, IIC_iALUi, !strconcat(opc, "s $dst, $a, $b"), [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, Requires<[IsARM, CarryDefIsUsed]> { - let Defs = [CPSR]; + let Defs = [CPSR]; + let Inst{25} = 1; } def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - DPFrm, !strconcat(opc, "s $dst, $a, $b"), + DPFrm, IIC_iALUr, !strconcat(opc, "s $dst, $a, $b"), [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, Requires<[IsARM, CarryDefIsUsed]> { - let Defs = [CPSR]; + let Defs = [CPSR]; + let Inst{25} = 0; } def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, !strconcat(opc, "s $dst, $a, $b"), + DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "s $dst, $a, $b"), [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, Requires<[IsARM, CarryDefIsUsed]> { - let Defs = [CPSR]; + let Defs = [CPSR]; + let Inst{25} = 0; } } } @@ -467,23 +534,23 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string 
opc, PatFrag opnode, let neverHasSideEffects = 1, isNotDuplicable = 1 in def CONSTPOOL_ENTRY : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, - i32imm:$size), + i32imm:$size), NoItinerary, "${instid:label} ${cpidx:cpentry}", []>; let Defs = [SP], Uses = [SP] in { def ADJCALLSTACKUP : -PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), +PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, "@ ADJCALLSTACKUP $amt1", [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKDOWN : -PseudoInst<(outs), (ins i32imm:$amt, pred:$p), +PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, "@ ADJCALLSTACKDOWN $amt", [(ARMcallseq_start timm:$amt)]>; } def DWARF_LOC : -PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file), +PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file), NoItinerary, ".loc $file, $line, $col", [(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>; @@ -491,42 +558,42 @@ PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file), // Address computation and loads and stores in PIC mode. let isNotDuplicable = 1 in { def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), - Pseudo, "$cp:\n\tadd$p $dst, pc, $a", + Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p $dst, pc, $a", [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; let AddedComplexity = 10 in { let canFoldAsLoad = 1 in def PICLDR : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tldr$p $dst, $addr", + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p $dst, $addr", [(set GPR:$dst, (load addrmodepc:$addr))]>; def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tldr${p}h $dst, $addr", + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}h $dst, $addr", [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>; def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tldr${p}b $dst, $addr", + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}b $dst, $addr", [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>; def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tldr${p}sh $dst, $addr", + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}sh $dst, $addr", [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>; def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tldr${p}sb $dst, $addr", + Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}sb $dst, $addr", [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>; } let AddedComplexity = 10 in { def PICSTR : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tstr$p $src, $addr", + Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p $src, $addr", [(store GPR:$src, addrmodepc:$addr)]>; def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tstr${p}h $src, $addr", + Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr${p}h $src, $addr", [(truncstorei16 GPR:$src, addrmodepc:$addr)]>; def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), - Pseudo, "${addr:label}:\n\tstr${p}b $src, $addr", + Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr${p}b $src, $addr", [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; } } // isNotDuplicable = 1 @@ -534,135 +601,152 @@ def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), // LEApcrel - Load a pc-relative address into a register without offending the // 
assembler. -def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo, - !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(", - "${:private}PCRELL${:uid}+8))\n"), - !strconcat("${:private}PCRELL${:uid}:\n\t", - "add$p $dst, pc, #PCRELV${:uid}")), +def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), + Pseudo, IIC_iALUi, + !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, ($label-(", + "${:private}PCRELL${:uid}+8))\n"), + !strconcat("${:private}PCRELL${:uid}:\n\t", + "add$p $dst, pc, #${:private}PCRELV${:uid}")), []>; def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), - (ins i32imm:$label, i32imm:$id, pred:$p), - Pseudo, - !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(", - "${:private}PCRELL${:uid}+8))\n"), - !strconcat("${:private}PCRELL${:uid}:\n\t", - "add$p $dst, pc, #PCRELV${:uid}")), - []>; + (ins i32imm:$label, nohash_imm:$id, pred:$p), + Pseudo, IIC_iALUi, + !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, " + "(${label}_${id}-(", + "${:private}PCRELL${:uid}+8))\n"), + !strconcat("${:private}PCRELL${:uid}:\n\t", + "add$p $dst, pc, #${:private}PCRELV${:uid}")), + []> { + let Inst{25} = 1; +} //===----------------------------------------------------------------------===// // Control Flow Instructions. // -let isReturn = 1, isTerminator = 1 in - def BX_RET : AI<(outs), (ins), BrMiscFrm, "bx", " lr", [(ARMretflag)]> { +let isReturn = 1, isTerminator = 1, isBarrier = 1 in + def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "bx", " lr", [(ARMretflag)]> { let Inst{7-4} = 0b0001; let Inst{19-8} = 0b111111111111; let Inst{27-20} = 0b00010010; } // FIXME: remove when we have a way to marking a MI with these properties. -// FIXME: $dst1 should be a def. But the extra ops must be in the end of the -// operand list. // FIXME: Should pc be an implicit operand like PICADD, etc? -let isReturn = 1, isTerminator = 1 in +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1 in def LDM_RET : AXI4ld<(outs), - (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops), - LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1", + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + LdStMulFrm, IIC_Br, "ldm${p}${addr:submode} $addr, $wb", []>; // On non-Darwin platforms R9 is callee-saved. 
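The LEApcrel and LEApcrelJT asm strings above encode ARM's pc-read rule: an instruction that reads pc sees its own address plus 8, so the assembler-computed .set ${:private}PCRELV${:uid}, ($label-(PCRELL+8)) makes "add $dst, pc, #PCRELV" land exactly on the label. A self-contained C++ sketch of that arithmetic (function and variable names are invented for illustration):

#include <cstdint>
#include <cassert>

// On ARM (not Thumb), reading PC inside an instruction yields that
// instruction's address + 8 (two instructions ahead).
static uint32_t lea_pcrel(uint32_t add_insn_addr, uint32_t label_addr) {
  uint32_t pc_value = add_insn_addr + 8;           // what "pc" reads as
  uint32_t imm = label_addr - (add_insn_addr + 8); // .set PCRELV, $label-(PCRELL+8)
  return pc_value + imm;                           // add $dst, pc, #PCRELV
}

int main() {
  assert(lea_pcrel(0x8000, 0x8040) == 0x8040);     // lands exactly on the label
  return 0;
}

The call definitions that follow list everything a call clobbers; per the note above, R9 is callee-saved outside Darwin, so it joins the Defs lists only in the r9 (Darwin) variants further down.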
-let isCall = 1, Itinerary = IIC_Br, - Defs = [R0, R1, R2, R3, R12, LR, - D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in { +let isCall = 1, + Defs = [R0, R1, R2, R3, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops), - "bl ${func:call}", - [(ARMcall tglobaladdr:$func)]>, Requires<[IsNotDarwin]>; + IIC_Br, "bl ${func:call}", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsARM, IsNotDarwin]>; def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops), - "bl", " ${func:call}", - [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsNotDarwin]>; + IIC_Br, "bl", " ${func:call}", + [(ARMcall_pred tglobaladdr:$func)]>, + Requires<[IsARM, IsNotDarwin]>; // ARMv5T and above def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, - "blx $func", - [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsNotDarwin]> { + IIC_Br, "blx $func", + [(ARMcall GPR:$func)]>, + Requires<[IsARM, HasV5T, IsNotDarwin]> { let Inst{7-4} = 0b0011; let Inst{19-8} = 0b111111111111; let Inst{27-20} = 0b00010010; } - let Uses = [LR] in { - // ARMv4T - def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops), - "mov lr, pc\n\tbx $func", - [(ARMcall_nolink GPR:$func)]>, Requires<[IsNotDarwin]>; + // ARMv4T + def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops), + IIC_Br, "mov lr, pc\n\tbx $func", + [(ARMcall_nolink GPR:$func)]>, + Requires<[IsARM, IsNotDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; } } // On Darwin R9 is call-clobbered. -let isCall = 1, Itinerary = IIC_Br, - Defs = [R0, R1, R2, R3, R9, R12, LR, - D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in { +let isCall = 1, + Defs = [R0, R1, R2, R3, R9, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops), - "bl ${func:call}", - [(ARMcall tglobaladdr:$func)]>, Requires<[IsDarwin]>; + IIC_Br, "bl ${func:call}", + [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]>; def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops), - "bl", " ${func:call}", - [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsDarwin]>; + IIC_Br, "bl", " ${func:call}", + [(ARMcall_pred tglobaladdr:$func)]>, + Requires<[IsARM, IsDarwin]>; // ARMv5T and above def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, - "blx $func", + IIC_Br, "blx $func", [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> { let Inst{7-4} = 0b0011; let Inst{19-8} = 0b111111111111; let Inst{27-20} = 0b00010010; } - let Uses = [LR] in { - // ARMv4T - def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops), - "mov lr, pc\n\tbx $func", - [(ARMcall_nolink GPR:$func)]>, Requires<[IsDarwin]>; + // ARMv4T + def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops), + IIC_Br, "mov lr, pc\n\tbx $func", + [(ARMcall_nolink GPR:$func)]>, Requires<[IsARM, IsDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; } } -let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in { +let isBranch = 1, isTerminator = 1 in { // B is "predicable" since it can be xformed into a Bcc. 
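Pre-v5T ARM has no blx, so the BX and BXr9 pseudos above expand to the pair "mov lr, pc" / "bx $func". A sketch of why that pair produces the right return address (addresses are made up):

#include <cstdint>
#include <cassert>

// ARMv4T indirect call: mov lr, pc ; bx rX.  Reading pc yields the current
// instruction's address + 8, so the mov writes the address of the
// instruction just past the bx: the correct return address.
int main() {
  uint32_t mov_addr = 0x1000;        // address of "mov lr, pc"
  uint32_t bx_addr  = mov_addr + 4;  // address of "bx rX"
  uint32_t lr       = mov_addr + 8;  // what the mov stores into lr
  assert(lr == bx_addr + 4);         // return lands just after the bx
  return 0;
}

Below, B keeps isPredicable for exactly the reason the comment gives: a predicated B is just a Bcc.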
let isBarrier = 1 in { let isPredicable = 1 in - def B : ABXI<0b1010, (outs), (ins brtarget:$target), "b $target", - [(br bb:$target)]>; + def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br, + "b $target", [(br bb:$target)]>; let isNotDuplicable = 1, isIndirectBranch = 1 in { def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id), - "mov pc, $target \n$jt", + IIC_Br, "mov pc, $target \n$jt", [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> { let Inst{20} = 0; // S Bit let Inst{24-21} = 0b1101; - let Inst{27-26} = {0,0}; + let Inst{27-25} = 0b000; } def BR_JTm : JTI<(outs), (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id), - "ldr pc, $target \n$jt", - [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, - imm:$id)]> { + IIC_Br, "ldr pc, $target \n$jt", + [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, + imm:$id)]> { let Inst{20} = 1; // L bit let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-25} = 0b011; } def BR_JTadd : JTI<(outs), (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id), - "add pc, $target, $idx \n$jt", + IIC_Br, "add pc, $target, $idx \n$jt", [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, imm:$id)]> { let Inst{20} = 0; // S bit let Inst{24-21} = 0b0100; - let Inst{27-26} = {0,0}; + let Inst{27-25} = 0b000; } } // isNotDuplicable = 1, isIndirectBranch = 1 } // isBarrier = 1 @@ -670,7 +754,7 @@ let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in { // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( def Bcc : ABI<0b1010, (outs), (ins brtarget:$target), - "b", " $target", + IIC_Br, "b", " $target", [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>; } @@ -679,133 +763,141 @@ let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in { // // Load -let canFoldAsLoad = 1 in -def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, +let canFoldAsLoad = 1, isReMaterializable = 1 in +def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, "ldr", " $dst, $addr", [(set GPR:$dst, (load addrmode2:$addr))]>; // Special LDR for loads from non-pc-relative constpools. 
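The three jump-table branches above differ only in how the destination reaches pc: BR_JTr moves an already-computed address, BR_JTm loads one through addrmode2, and BR_JTadd adds an index to the table base. A loose C++ analog of the BR_JTadd form; treating $idx as a pre-scaled byte offset over 4-byte entries is this sketch's assumption:

#include <cstdint>
#include <cassert>

// BR_JTadd dispatch: add pc, $target, $idx.  The index register is assumed
// here to already hold a byte offset into a table of 4-byte ARM words.
int main() {
  uint32_t table_base = 0x2000;            // $target: address of the jump table
  uint32_t case_num   = 3;
  uint32_t idx        = case_num * 4;      // byte offset, one word per entry
  uint32_t pc         = table_base + idx;  // add pc, $target, $idx
  assert(pc == 0x200C);
  return 0;
}

LDRcp, defined next, is the constant-pool load the comment above describes; it is mayLoad and rematerializable but deliberately carries no selection pattern.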
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in -def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, +def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, "ldr", " $dst, $addr", []>; // Loads with zero extension def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, - "ldr", "h $dst, $addr", - [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>; + IIC_iLoadr, "ldr", "h $dst, $addr", + [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>; -def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, - "ldr", "b $dst, $addr", - [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>; +def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, + IIC_iLoadr, "ldr", "b $dst, $addr", + [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>; // Loads with sign extension def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, - "ldr", "sh $dst, $addr", - [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>; + IIC_iLoadr, "ldr", "sh $dst, $addr", + [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>; def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, - "ldr", "sb $dst, $addr", - [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>; + IIC_iLoadr, "ldr", "sb $dst, $addr", + [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>; -let mayLoad = 1 in { +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm, - "ldr", "d $dst1, $addr", []>, Requires<[IsARM, HasV5T]>; + IIC_iLoadr, "ldr", "d $dst1, $addr", + []>, Requires<[IsARM, HasV5TE]>; // Indexed loads def LDR_PRE : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode2:$addr), LdFrm, + (ins addrmode2:$addr), LdFrm, IIC_iLoadru, "ldr", " $dst, $addr!", "$addr.base = $base_wb", []>; def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), LdFrm, + (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru, "ldr", " $dst, [$base], $offset", "$base = $base_wb", []>; def LDRH_PRE : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, "ldr", "h $dst, $addr!", "$addr.base = $base_wb", []>; def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, "ldr", "h $dst, [$base], $offset", "$base = $base_wb", []>; def LDRB_PRE : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode2:$addr), LdFrm, + (ins addrmode2:$addr), LdFrm, IIC_iLoadru, "ldr", "b $dst, $addr!", "$addr.base = $base_wb", []>; def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am2offset:$offset), LdFrm, + (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, "ldr", "b $dst, [$base], $offset", "$base = $base_wb", []>; def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, "ldr", "sh $dst, $addr!", "$addr.base = $base_wb", []>; def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am3offset:$offset), LdMiscFrm, + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, "ldr", "sh $dst, [$base], $offset", "$base = $base_wb", []>; def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb), - (ins addrmode3:$addr), LdMiscFrm, + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru, "ldr", "sb $dst, $addr!", "$addr.base = $base_wb", []>; def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), - (ins 
GPR:$base,am3offset:$offset), LdMiscFrm, + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, "ldr", "sb $dst, [$base], $offset", "$base = $base_wb", []>; } // Store -def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, +def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, "str", " $src, $addr", [(store GPR:$src, addrmode2:$addr)]>; // Stores with truncate -def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, +def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, IIC_iStorer, "str", "h $src, $addr", [(truncstorei16 GPR:$src, addrmode3:$addr)]>; -def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, +def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, "str", "b $src, $addr", [(truncstorei8 GPR:$src, addrmode2:$addr)]>; // Store doubleword -let mayStore = 1 in -def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),StMiscFrm, - "str", "d $src1, $addr", []>, Requires<[IsARM, HasV5T]>; +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), + StMiscFrm, IIC_iStorer, + "str", "d $src1, $addr", []>, Requires<[IsARM, HasV5TE]>; // Indexed stores def STR_PRE : AI2stwpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, am2offset:$offset), StFrm, + (ins GPR:$src, GPR:$base, am2offset:$offset), + StFrm, IIC_iStoreru, "str", " $src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>; def STR_POST : AI2stwpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, "str", " $src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_store GPR:$src, GPR:$base, am2offset:$offset))]>; def STRH_PRE : AI3sthpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm, + (ins GPR:$src, GPR:$base,am3offset:$offset), + StMiscFrm, IIC_iStoreru, "str", "h $src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>; def STRH_POST: AI3sthpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm, + (ins GPR:$src, GPR:$base,am3offset:$offset), + StMiscFrm, IIC_iStoreru, "str", "h $src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_truncsti16 GPR:$src, GPR:$base, am3offset:$offset))]>; def STRB_PRE : AI2stbpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, "str", "b $src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_truncsti8 GPR:$src, GPR:$base, am2offset:$offset))]>; def STRB_POST: AI2stbpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, + (ins GPR:$src, GPR:$base,am2offset:$offset), + StFrm, IIC_iStoreru, "str", "b $src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_truncsti8 GPR:$src, GPR:$base, am2offset:$offset))]>; @@ -814,17 +906,16 @@ def STRB_POST: AI2stbpo<(outs GPR:$base_wb), // Load / store multiple Instructions. // -// FIXME: $dst1 should be a def. 
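All of the _PRE/_POST variants above share one writeback contract, spelled out by the "$base = $base_wb" constraints: a pre-indexed access uses base+offset and writes the new address back, while a post-indexed access uses the old base and updates it afterwards. A minimal C++ model (Mem and the helper names are invented):

#include <cstdint>
#include <cassert>

// Pre-indexed:  ldr dst, [base, #off]!  -- address = base+off, base updated.
// Post-indexed: ldr dst, [base], #off   -- address = base, then base += off.
struct Mem { uint32_t at(uint32_t) const { return 0xDEADBEEF; } };

static uint32_t ldr_pre(Mem m, uint32_t &base, int32_t off) {
  base += off;              // writeback happens with the new address
  return m.at(base);
}
static uint32_t ldr_post(Mem m, uint32_t &base, int32_t off) {
  uint32_t v = m.at(base);  // access uses the old address
  base += off;              // then the base register is updated
  return v;
}

int main() {
  Mem m; uint32_t b1 = 100, b2 = 100;
  ldr_pre(m, b1, 8);  assert(b1 == 108);
  ldr_post(m, b2, 8); assert(b2 == 108);
  return 0;
}

For the load/store-multiple definitions that follow, the new hasExtraDefRegAllocReq and hasExtraSrcRegAllocReq flags appear to be what retires the old FIXME: they tell the register allocator that the variable_ops register list carries real defs and uses.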
-let mayLoad = 1 in +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in def LDM : AXI4ld<(outs), - (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops), - LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1", + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + LdStMulFrm, IIC_iLoadm, "ldm${p}${addr:submode} $addr, $wb", []>; -let mayStore = 1 in +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in def STM : AXI4st<(outs), - (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops), - LdStMulFrm, "stm${p}${addr:submode} $addr, $src1", + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + LdStMulFrm, IIC_iStorem, "stm${p}${addr:submode} $addr, $wb", []>; //===----------------------------------------------------------------------===// @@ -832,16 +923,42 @@ def STM : AXI4st<(outs), // let neverHasSideEffects = 1 in -def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, - "mov", " $dst, $src", []>, UnaryDP; -def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, - "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP; +def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, + "mov", " $dst, $src", []>, UnaryDP; +def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), + DPSoRegFrm, IIC_iMOVsr, + "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP; let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, - "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP; +def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi, + "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP { + let Inst{25} = 1; +} -def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), + DPFrm, IIC_iMOVi, + "movw", " $dst, $src", + [(set GPR:$dst, imm0_65535:$src)]>, + Requires<[IsARM, HasV6T2]> { + let Inst{20} = 0; + let Inst{25} = 1; +} + +let Constraints = "$src = $dst" in +def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm), + DPFrm, IIC_iMOVi, + "movt", " $dst, $imm", + [(set GPR:$dst, + (or (and GPR:$src, 0xffff), + lo16AllZero:$imm))]>, UnaryDP, + Requires<[IsARM, HasV6T2]> { + let Inst{20} = 0; + let Inst{25} = 1; +} + +let Uses = [CPSR] in +def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi, "mov", " $dst, $src, rrx", [(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP; @@ -849,11 +966,11 @@ def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, // due to flag operands. 
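movw (MOVi16) writes a zero-extended 16-bit immediate, while movt (MOVTi16) replaces only the top half, exactly as its (or (and GPR:$src, 0xffff), lo16AllZero:$imm) pattern says; together they can build any 32-bit constant on v6T2. MOVrx is rotate-right-with-extend, shifting the carry flag into bit 31, which is why it now lists Uses = [CPSR]. A short C++ sketch of all three (helper names are illustrative):

#include <cstdint>
#include <cassert>

// movw dst, #lo16 : writes the low half, zeroing the top.
static uint32_t movw(uint16_t lo)               { return lo; }
// movt dst, #hi16 : replaces only the top half, preserving the low half.
static uint32_t movt(uint32_t src, uint16_t hi) { return (src & 0xFFFFu) | ((uint32_t)hi << 16); }
// mov dst, src, rrx : rotate right with extend, carry shifts into bit 31.
static uint32_t rrx(uint32_t src, bool carry)   { return (src >> 1) | ((uint32_t)carry << 31); }

int main() {
  uint32_t r = movt(movw(0x5678), 0x1234);
  assert(r == 0x12345678u);              // building a full 32-bit constant
  assert(rrx(0x3, true) == 0x80000001u);
  return 0;
}

The flag-setting shift-moves below are the converse case: they define CPSR, which is why they cannot be expressed as ordinary movs.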
let Defs = [CPSR] in { -def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, - "mov", "s $dst, $src, lsr #1", +def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, + IIC_iMOVsi, "mov", "s $dst, $src, lsr #1", [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP; def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, - "mov", "s $dst, $src, asr #1", + IIC_iMOVsi, "mov", "s $dst, $src, asr #1", [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP; } @@ -901,6 +1018,24 @@ defm UXTAH : AI_bin_rrot<0b01101111, "uxtah", // TODO: UXT(A){B|H}16 +def SBFX : I<(outs GPR:$dst), + (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, + "sbfx", " $dst, $src, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111101; + let Inst{6-4} = 0b101; +} + +def UBFX : I<(outs GPR:$dst), + (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, + "ubfx", " $dst, $src, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111111; + let Inst{6-4} = 0b101; +} + //===----------------------------------------------------------------------===// // Arithmetic Instructions. // @@ -923,30 +1058,36 @@ defm SBC : AI1_adde_sube_irs<0b0110, "sbc", // These don't define reg/reg forms, because they are handled above. def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - "rsb", " $dst, $a, $b", - [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]>; + IIC_iALUi, "rsb", " $dst, $a, $b", + [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> { + let Inst{25} = 1; +} def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - "rsb", " $dst, $a, $b", + IIC_iALUsr, "rsb", " $dst, $a, $b", [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]>; // RSB with 's' bit set. let Defs = [CPSR] in { def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - "rsb", "s $dst, $a, $b", - [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]>; + IIC_iALUi, "rsb", "s $dst, $a, $b", + [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> { + let Inst{25} = 1; +} def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - "rsb", "s $dst, $a, $b", + IIC_iALUsr, "rsb", "s $dst, $a, $b", [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]>; } let Uses = [CPSR] in { def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, "rsc", " $dst, $a, $b", + DPFrm, IIC_iALUi, "rsc", " $dst, $a, $b", [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>, - Requires<[IsARM, CarryDefIsUnused]>; + Requires<[IsARM, CarryDefIsUnused]> { + let Inst{25} = 1; +} def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, "rsc", " $dst, $a, $b", + DPSoRegFrm, IIC_iALUsr, "rsc", " $dst, $a, $b", [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>, Requires<[IsARM, CarryDefIsUnused]>; } @@ -954,11 +1095,13 @@ def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), // FIXME: Allow these to be predicated. 
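sbfx and ubfx above extract a $width-bit field starting at $lsb, sign- or zero-extended; the bfc defined a little further on clears such a field in place, which is why its pattern is an AND with an inverted mask (bf_inv_mask_imm). A self-contained sketch of all three (the width <= 31 bound is an assumption of the sketch, not an encoding claim):

#include <cstdint>
#include <cassert>

// ubfx dst, src, #lsb, #width : zero-extend width bits starting at lsb.
static uint32_t ubfx(uint32_t src, unsigned lsb, unsigned width) {
  return (src >> lsb) & ((1u << width) - 1);       // sketch assumes width <= 31
}
// sbfx : same field, sign-extended.
static int32_t sbfx(uint32_t src, unsigned lsb, unsigned width) {
  uint32_t f = ubfx(src, lsb, width);
  uint32_t sign = 1u << (width - 1);
  return (int32_t)((f ^ sign) - sign);             // standard sign-extension trick
}
// bfc dst, #lsb, #width : clear the field, i.e. AND with the inverted mask.
static uint32_t bfc(uint32_t src, unsigned lsb, unsigned width) {
  return src & ~(((1u << width) - 1) << lsb);
}

int main() {
  assert(ubfx(0x00F0u, 4, 4) == 0xF);
  assert(sbfx(0x00F0u, 4, 4) == -1);
  assert(bfc(0xFFFFFFFFu, 8, 8) == 0xFFFF00FFu);
  return 0;
}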
let Defs = [CPSR], Uses = [CPSR] in { def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), - DPFrm, "rscs $dst, $a, $b", + DPFrm, IIC_iALUi, "rscs $dst, $a, $b", [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>, - Requires<[IsARM, CarryDefIsUnused]>; + Requires<[IsARM, CarryDefIsUnused]> { + let Inst{25} = 1; +} def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), - DPSoRegFrm, "rscs $dst, $a, $b", + DPSoRegFrm, IIC_iALUsr, "rscs $dst, $a, $b", [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>, Requires<[IsARM, CarryDefIsUnused]>; } @@ -992,16 +1135,27 @@ defm EOR : AsI1_bin_irs<0b0001, "eor", defm BIC : AsI1_bin_irs<0b1110, "bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, +def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi, + "bfc", " $dst, $imm", "$src = $dst", + [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111110; + let Inst{6-0} = 0b0011111; +} + +def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, "mvn", " $dst, $src", [(set GPR:$dst, (not GPR:$src))]>, UnaryDP; def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, - "mvn", " $dst, $src", + IIC_iMOVsr, "mvn", " $dst, $src", [(set GPR:$dst, (not so_reg:$src))]>, UnaryDP; let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, - "mvn", " $dst, $imm", - [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP; +def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, + IIC_iMOVi, "mvn", " $dst, $imm", + [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP { + let Inst{25} = 1; +} def : ARMPat<(and GPR:$src, so_imm_not:$imm), (BICri GPR:$src, so_imm_not:$imm)>; @@ -1012,43 +1166,48 @@ def : ARMPat<(and GPR:$src, so_imm_not:$imm), let isCommutable = 1 in def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - "mul", " $dst, $a, $b", + IIC_iMUL32, "mul", " $dst, $a, $b", [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>; def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - "mla", " $dst, $a, $b, $c", + IIC_iMAC32, "mla", " $dst, $a, $b, $c", [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>; +def MLS : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "mls", " $dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>, + Requires<[IsARM, HasV6T2]>; + // Extra precision multiplies with low / high results let neverHasSideEffects = 1 in { let isCommutable = 1 in { def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), + (ins GPR:$a, GPR:$b), IIC_iMUL64, "smull", " $ldst, $hdst, $a, $b", []>; def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), + (ins GPR:$a, GPR:$b), IIC_iMUL64, "umull", " $ldst, $hdst, $a, $b", []>; } // Multiply + accumulate def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), + (ins GPR:$a, GPR:$b), IIC_iMAC64, "smlal", " $ldst, $hdst, $a, $b", []>; def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), + (ins GPR:$a, GPR:$b), IIC_iMAC64, "umlal", " $ldst, $hdst, $a, $b", []>; def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst), - (ins GPR:$a, GPR:$b), + (ins GPR:$a, GPR:$b), IIC_iMAC64, "umaal", " $ldst, $hdst, $a, $b", []>, Requires<[IsARM, HasV6]>; } // neverHasSideEffects // Most significant word multiply 
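smmul, defined next, keeps only the high word of the signed 64-bit product (the mulhs pattern), and the v5TE halfword multiplies of AI_smul follow it; the mls added above for v6T2 computes c - a*b. A behavioral sketch (helper names are invented):

#include <cstdint>
#include <cassert>

// mls dst, a, b, c : multiply and subtract, dst = c - a*b (new in v6T2).
static uint32_t mls(uint32_t a, uint32_t b, uint32_t c) { return c - a * b; }

// smmul dst, a, b : most-significant word of the signed 64-bit product.
static int32_t smmul(int32_t a, int32_t b) {
  return (int32_t)(((int64_t)a * b) >> 32);
}
// smulbb and friends : multiply selected signed 16-bit halves.
static int32_t smulbb(uint32_t a, uint32_t b) {
  return (int32_t)(int16_t)a * (int32_t)(int16_t)b;
}
// smulwb : 32x16 multiply keeping the top 32 bits of the 48-bit product.
static int32_t smulwb(int32_t a, uint32_t b) {
  return (int32_t)(((int64_t)a * (int16_t)b) >> 16);
}

int main() {
  assert(mls(3, 4, 100) == 88);
  assert(smmul(1 << 20, 1 << 20) == 256);   // 2^40 >> 32
  assert(smulbb(0xFFFFu, 2) == -2);         // low half of a is -1
  assert(smulwb(1 << 16, 3) == 3);          // (65536 * 3) >> 16
  return 0;
}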
def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - "smmul", " $dst, $a, $b", + IIC_iMUL32, "smmul", " $dst, $a, $b", [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>, Requires<[IsARM, HasV6]> { let Inst{7-4} = 0b0001; @@ -1056,7 +1215,7 @@ def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), } def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - "smmla", " $dst, $a, $b, $c", + IIC_iMAC32, "smmla", " $dst, $a, $b, $c", [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>, Requires<[IsARM, HasV6]> { let Inst{7-4} = 0b0001; @@ -1064,7 +1223,7 @@ def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), - "smmls", " $dst, $a, $b, $c", + IIC_iMAC32, "smmls", " $dst, $a, $b, $c", [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>, Requires<[IsARM, HasV6]> { let Inst{7-4} = 0b1101; @@ -1072,7 +1231,7 @@ def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), multiclass AI_smul<string opc, PatFrag opnode> { def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - !strconcat(opc, "bb"), " $dst, $a, $b", + IIC_iMUL32, !strconcat(opc, "bb"), " $dst, $a, $b", [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), (sext_inreg GPR:$b, i16)))]>, Requires<[IsARM, HasV5TE]> { @@ -1081,7 +1240,7 @@ multiclass AI_smul<string opc, PatFrag opnode> { } def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - !strconcat(opc, "bt"), " $dst, $a, $b", + IIC_iMUL32, !strconcat(opc, "bt"), " $dst, $a, $b", [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), (sra GPR:$b, (i32 16))))]>, Requires<[IsARM, HasV5TE]> { @@ -1090,7 +1249,7 @@ multiclass AI_smul<string opc, PatFrag opnode> { } def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - !strconcat(opc, "tb"), " $dst, $a, $b", + IIC_iMUL32, !strconcat(opc, "tb"), " $dst, $a, $b", [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), (sext_inreg GPR:$b, i16)))]>, Requires<[IsARM, HasV5TE]> { @@ -1099,7 +1258,7 @@ multiclass AI_smul<string opc, PatFrag opnode> { } def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - !strconcat(opc, "tt"), " $dst, $a, $b", + IIC_iMUL32, !strconcat(opc, "tt"), " $dst, $a, $b", [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), (sra GPR:$b, (i32 16))))]>, Requires<[IsARM, HasV5TE]> { @@ -1108,7 +1267,7 @@ multiclass AI_smul<string opc, PatFrag opnode> { } def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - !strconcat(opc, "wb"), " $dst, $a, $b", + IIC_iMUL16, !strconcat(opc, "wb"), " $dst, $a, $b", [(set GPR:$dst, (sra (opnode GPR:$a, (sext_inreg GPR:$b, i16)), (i32 16)))]>, Requires<[IsARM, HasV5TE]> { @@ -1117,7 +1276,7 @@ multiclass AI_smul<string opc, PatFrag opnode> { } def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - !strconcat(opc, "wt"), " $dst, $a, $b", + IIC_iMUL16, !strconcat(opc, "wt"), " $dst, $a, $b", [(set GPR:$dst, (sra (opnode GPR:$a, (sra GPR:$b, (i32 16))), (i32 16)))]>, Requires<[IsARM, HasV5TE]> { @@ -1129,7 +1288,7 @@ multiclass AI_smul<string opc, PatFrag opnode> { multiclass AI_smla<string opc, PatFrag opnode> { def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - !strconcat(opc, "bb"), " $dst, $a, $b, $acc", + IIC_iMAC16, !strconcat(opc, "bb"), " $dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), (sext_inreg GPR:$b, i16))))]>, @@ -1139,7 +1298,7 @@ multiclass AI_smla<string opc, PatFrag opnode> { 
} def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - !strconcat(opc, "bt"), " $dst, $a, $b, $acc", + IIC_iMAC16, !strconcat(opc, "bt"), " $dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), (sra GPR:$b, (i32 16)))))]>, Requires<[IsARM, HasV5TE]> { @@ -1148,7 +1307,7 @@ multiclass AI_smla<string opc, PatFrag opnode> { } def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - !strconcat(opc, "tb"), " $dst, $a, $b, $acc", + IIC_iMAC16, !strconcat(opc, "tb"), " $dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), (sext_inreg GPR:$b, i16))))]>, Requires<[IsARM, HasV5TE]> { @@ -1157,7 +1316,7 @@ multiclass AI_smla<string opc, PatFrag opnode> { } def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - !strconcat(opc, "tt"), " $dst, $a, $b, $acc", + IIC_iMAC16, !strconcat(opc, "tt"), " $dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), (sra GPR:$b, (i32 16)))))]>, Requires<[IsARM, HasV5TE]> { @@ -1166,7 +1325,7 @@ multiclass AI_smla<string opc, PatFrag opnode> { } def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - !strconcat(opc, "wb"), " $dst, $a, $b, $acc", + IIC_iMAC16, !strconcat(opc, "wb"), " $dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, (sext_inreg GPR:$b, i16)), (i32 16))))]>, Requires<[IsARM, HasV5TE]> { @@ -1175,7 +1334,7 @@ multiclass AI_smla<string opc, PatFrag opnode> { } def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), - !strconcat(opc, "wt"), " $dst, $a, $b, $acc", + IIC_iMAC16, !strconcat(opc, "wt"), " $dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, (sra GPR:$b, (i32 16))), (i32 16))))]>, Requires<[IsARM, HasV5TE]> { @@ -1194,7 +1353,7 @@ defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; // Misc. Arithmetic Instructions. 
// -def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), +def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, "clz", " $dst, $src", [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> { let Inst{7-4} = 0b0001; @@ -1202,7 +1361,7 @@ def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), let Inst{19-16} = 0b1111; } -def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), +def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, "rev", " $dst, $src", [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> { let Inst{7-4} = 0b0011; @@ -1210,7 +1369,7 @@ def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), let Inst{19-16} = 0b1111; } -def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), +def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, "rev16", " $dst, $src", [(set GPR:$dst, (or (and (srl GPR:$src, (i32 8)), 0xFF), @@ -1223,7 +1382,7 @@ def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), let Inst{19-16} = 0b1111; } -def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), +def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, "revsh", " $dst, $src", [(set GPR:$dst, (sext_inreg @@ -1237,7 +1396,7 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - "pkhbt", " $dst, $src1, $src2, LSL $shamt", + IIC_iALUsi, "pkhbt", " $dst, $src1, $src2, LSL $shamt", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), (and (shl GPR:$src2, (i32 imm:$shamt)), 0xFFFF0000)))]>, @@ -1254,7 +1413,7 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - "pkhtb", " $dst, $src1, $src2, ASR $shamt", + IIC_iALUsi, "pkhtb", " $dst, $src1, $src2, ASR $shamt", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), (and (sra GPR:$src2, imm16_31:$shamt), 0xFFFF)))]>, Requires<[IsARM, HasV6]> { @@ -1300,21 +1459,23 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm, - "mov", " $dst, $true", + IIC_iCMOVr, "mov", " $dst, $true", [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $dst">, UnaryDP; def MOVCCs : AI1<0b1101, (outs GPR:$dst), - (ins GPR:$false, so_reg:$true), DPSoRegFrm, + (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr, "mov", " $dst, $true", [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $dst">, UnaryDP; def MOVCCi : AI1<0b1101, (outs GPR:$dst), - (ins GPR:$false, so_imm:$true), DPFrm, + (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi, "mov", " $dst, $true", [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $dst">, UnaryDP; + RegConstraint<"$false = $dst">, UnaryDP { + let Inst{25} = 1; +} //===----------------------------------------------------------------------===// @@ -1324,14 +1485,14 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst), // __aeabi_read_tp preserves the registers r1-r3. 
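Several of these v6 patterns are pure bit shuffles: rev16 swaps the bytes within each halfword (the four-term or/and/shift pattern above folds into two masked shifts), and pkhbt packs the low half of one register with a shifted-in top half from another. A compact sketch:

#include <cstdint>
#include <cassert>

// rev16 dst, src : byte-swap each halfword independently.
static uint32_t rev16(uint32_t x) {
  return ((x & 0xFF00FF00u) >> 8) | ((x & 0x00FF00FFu) << 8);
}
// pkhbt dst, a, b, lsl #s : low half from a, high half from (b << s).
static uint32_t pkhbt(uint32_t a, uint32_t b, unsigned s) {
  return (a & 0x0000FFFFu) | ((b << s) & 0xFFFF0000u);
}

int main() {
  assert(rev16(0x11223344u) == 0x22114433u);
  assert(pkhbt(0xAAAA5555u, 0x00001234u, 16) == 0x12345555u);
  return 0;
}

TPsoft, next, leans on the __aeabi_read_tp guarantee just quoted, which is why its Defs list stops at R0, R12, LR and CPSR.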
let isCall = 1, Defs = [R0, R12, LR, CPSR] in { - def TPsoft : ABXI<0b1011, (outs), (ins), + def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br, "bl __aeabi_read_tp", [(set R0, ARMthread_pointer)]>; } //===----------------------------------------------------------------------===// // SJLJ Exception handling intrinsics -// eh_sjlj_setjmp() is a three instruction sequence to store the return +// eh_sjlj_setjmp() is an instruction sequence to store the return // address and save #0 in R0 for the non-longjmp case. // Since by its nature we may be coming from some other function to get // here, and we're using the stack frame for the containing function to @@ -1342,13 +1503,19 @@ let isCall = 1, // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved registers, which is exactly what we want. let Defs = - [ R0, R1, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, - D0, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15 ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, + D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, + D31 ] in { def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src), - AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, - "add r0, pc, #4\n\t" - "str r0, [$src, #+4]\n\t" - "mov r0, #0 @ eh_setjmp", "", + AddrModeNone, SizeSpecial, IndexModeNone, + Pseudo, NoItinerary, + "str sp, [$src, #+8] @ eh_setjmp begin\n\t" + "add r12, pc, #8\n\t" + "str r12, [$src, #+4]\n\t" + "mov r0, #0\n\t" + "add pc, pc, #0\n\t" + "mov r0, #1 @ eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>; } @@ -1366,25 +1533,36 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), // Two piece so_imms. let isReMaterializable = 1 in -def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), Pseudo, +def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), + Pseudo, IIC_iMOVi, "mov", " $dst, $src", - [(set GPR:$dst, so_imm2part:$src)]>; + [(set GPR:$dst, so_imm2part:$src)]>, + Requires<[IsARM, NoV6T2]>; def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS), - (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)), - (so_imm2part_2 imm:$RHS))>; + (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS), - (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)), - (so_imm2part_2 imm:$RHS))>; + (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; + +// 32-bit immediate using movw + movt. +// This is a single pseudo instruction to make it re-materializable. Remove +// when we can do generalized remat. +let isReMaterializable = 1 in +def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi, + "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}", + [(set GPR:$dst, (i32 imm:$src))]>, + Requires<[IsARM, HasV6T2]>; // TODO: add,sub,and, 3-instr forms?
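Context for MOVi2pieces and MOVi32imm above: an ARM data-processing immediate (so_imm) is an 8-bit value rotated right by an even amount. Constants that fit cost one instruction; one that splits into two such pieces costs a two-instruction mov/orr-style pair (so_imm2part); and on v6T2 any remaining 32-bit constant costs one movw/movt pair, hence the NoV6T2/HasV6T2 predicates. A sketch of the encodability test (the helper is illustrative, not the ARMAddressingModes.h API):

#include <cstdint>
#include <cassert>

static uint32_t rotl32(uint32_t v, unsigned r) {
  r &= 31; return r ? (v << r) | (v >> (32 - r)) : v;
}
// so_imm: an 8-bit value rotated right by an even amount; equivalently,
// some even left-rotation of the value fits in 8 bits.
static bool is_so_imm(uint32_t v) {
  for (unsigned r = 0; r < 32; r += 2)
    if (rotl32(v, r) <= 0xFF) return true;
  return false;
}

int main() {
  assert(is_so_imm(0xFF));          // 8-bit value, rotation 0
  assert(is_so_imm(0xFF000000u));   // 0xFF ror 8
  assert(!is_so_imm(0x12345678u));  // needs MOVi2pieces or movw/movt
  return 0;
}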
// Direct calls def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, - Requires<[IsNotDarwin]>; + Requires<[IsARM, IsNotDarwin]>; def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, - Requires<[IsDarwin]>; + Requires<[IsARM, IsDarwin]>; // zextload i1 -> zextload i8 def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index a62597b..cd370aa 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -65,8 +65,28 @@ def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; -def NEONvduplaneq : SDNode<"ARMISD::VDUPLANEQ", - SDTypeProfile<1, 2, [SDTCisVT<2, i32>]>>; +def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; + +// VDUPLANE can produce a quad-register result from a double-register source, +// so the result is not constrained to match the source. +def NEONvduplane : SDNode<"ARMISD::VDUPLANE", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>>; + +def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; +def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; + +def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; +def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; +def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; + +def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; +def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; +def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; +def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; //===----------------------------------------------------------------------===// // NEON operand definitions @@ -87,28 +107,409 @@ def addrmode_neonldstm : Operand<i32>, //===----------------------------------------------------------------------===// /* TODO: Take advantage of vldm. -let mayLoad = 1 in { +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { def VLDMD : NI<(outs), (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops), + IIC_fpLoadm, "vldm${addr:submode} ${addr:base}, $dst1", - []>; + []> { + let Inst{27-25} = 0b110; + let Inst{20} = 1; + let Inst{11-9} = 0b101; +} def VLDMS : NI<(outs), (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops), + IIC_fpLoadm, "vldm${addr:submode} ${addr:base}, $dst1", - []>; + []> { + let Inst{27-25} = 0b110; + let Inst{20} = 1; + let Inst{11-9} = 0b101; +} } */ // Use vldmia to load a Q register as a D register pair. -def VLDRQ : NI<(outs QPR:$dst), (ins GPR:$addr), +def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), + IIC_fpLoadm, "vldmia $addr, ${dst:dregpair}", - [(set QPR:$dst, (v2f64 (load GPR:$addr)))]>; + [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { + let Inst{27-25} = 0b110; + let Inst{24} = 0; // P bit + let Inst{23} = 1; // U bit + let Inst{20} = 1; + let Inst{11-9} = 0b101; +} // Use vstmia to store a Q register as a D register pair. 
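The shuffle nodes introduced above (NEONvext, NEONvrev64/32/16, and the zip/uzp/trn pairs) all denote fixed element rearrangements. Two of them in plain C++, over 8-byte D-register values (array-based and purely illustrative):

#include <cstdint>
#include <cassert>

// vext.8 dst, a, b, #n : concatenate a:b and take 8 bytes starting at byte n;
// the NEONvext node carries n as its i32 operand.
static void vext8(const uint8_t a[8], const uint8_t b[8], unsigned n,
                  uint8_t dst[8]) {
  for (unsigned i = 0; i < 8; ++i)
    dst[i] = (n + i < 8) ? a[n + i] : b[n + i - 8];
}
// vrev64.8 : reverse the bytes within each 64-bit doubleword.
static void vrev64_8(const uint8_t src[8], uint8_t dst[8]) {
  for (unsigned i = 0; i < 8; ++i) dst[i] = src[7 - i];
}

int main() {
  uint8_t a[8] = {0,1,2,3,4,5,6,7}, b[8] = {8,9,10,11,12,13,14,15}, d[8];
  vext8(a, b, 3, d);
  assert(d[0] == 3 && d[7] == 10);
  vrev64_8(a, d);
  assert(d[0] == 7 && d[7] == 0);
  return 0;
}

VSTRQ below is VLDRQ's companion: a Q register stored as a D-register pair via vstmia.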
-def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr), +def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), + IIC_fpStorem, "vstmia $addr, ${src:dregpair}", - [(store (v2f64 QPR:$src), GPR:$addr)]>; + [(store (v2f64 QPR:$src), addrmode4:$addr)]> { + let Inst{27-25} = 0b110; + let Inst{24} = 0; // P bit + let Inst{23} = 1; // U bit + let Inst{20} = 0; + let Inst{11-9} = 0b101; +} + +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1, + !strconcat(OpcodeStr, "\t\\{$dst\\}, $addr"), "", + [(set DPR:$dst, (Ty (IntOp addrmode6:$addr)))]>; +class VLD1Q<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1, + !strconcat(OpcodeStr, "\t${dst:dregpair}, $addr"), "", + [(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>; + +def VLD1d8 : VLD1D<0b0000, "vld1.8", v8i8, int_arm_neon_vld1>; +def VLD1d16 : VLD1D<0b0100, "vld1.16", v4i16, int_arm_neon_vld1>; +def VLD1d32 : VLD1D<0b1000, "vld1.32", v2i32, int_arm_neon_vld1>; +def VLD1df : VLD1D<0b1000, "vld1.32", v2f32, int_arm_neon_vld1>; +def VLD1d64 : VLD1D<0b1100, "vld1.64", v1i64, int_arm_neon_vld1>; + +def VLD1q8 : VLD1Q<0b0000, "vld1.8", v16i8, int_arm_neon_vld1>; +def VLD1q16 : VLD1Q<0b0100, "vld1.16", v8i16, int_arm_neon_vld1>; +def VLD1q32 : VLD1Q<0b1000, "vld1.32", v4i32, int_arm_neon_vld1>; +def VLD1qf : VLD1Q<0b1000, "vld1.32", v4f32, int_arm_neon_vld1>; +def VLD1q64 : VLD1Q<0b1100, "vld1.64", v2i64, int_arm_neon_vld1>; + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { + +// VLD2 : Vector Load (multiple 2-element structures) +class VLD2D<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b10,0b1000,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD2, + !strconcat(OpcodeStr, "\t\\{$dst1,$dst2\\}, $addr"), "", []>; +class VLD2Q<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b10,0b0011,op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD2, + !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"), + "", []>; + +def VLD2d8 : VLD2D<0b0000, "vld2.8">; +def VLD2d16 : VLD2D<0b0100, "vld2.16">; +def VLD2d32 : VLD2D<0b1000, "vld2.32">; +def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1.64\t\\{$dst1,$dst2\\}, $addr", "", []>; + +def VLD2q8 : VLD2Q<0b0000, "vld2.8">; +def VLD2q16 : VLD2Q<0b0100, "vld2.16">; +def VLD2q32 : VLD2Q<0b1000, "vld2.32">; + +// VLD3 : Vector Load (multiple 3-element structures) +class VLD3D<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b10,0b0100,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr), IIC_VLD3, + !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"), "", []>; +class VLD3WB<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b10,0b0101,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$addr), IIC_VLD3, + !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"), + "$addr.addr = $wb", []>; + +def VLD3d8 : VLD3D<0b0000, "vld3.8">; +def VLD3d16 : VLD3D<0b0100, "vld3.16">; +def VLD3d32 : VLD3D<0b1000, "vld3.32">; +def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr), IIC_VLD1, + "vld1.64\t\\{$dst1,$dst2,$dst3\\}, $addr", "", []>; + +// vld3 to double-spaced even registers. 
+def VLD3q8a : VLD3WB<0b0000, "vld3.8">; +def VLD3q16a : VLD3WB<0b0100, "vld3.16">; +def VLD3q32a : VLD3WB<0b1000, "vld3.32">; + +// vld3 to double-spaced odd registers. +def VLD3q8b : VLD3WB<0b0000, "vld3.8">; +def VLD3q16b : VLD3WB<0b0100, "vld3.16">; +def VLD3q32b : VLD3WB<0b1000, "vld3.32">; + +// VLD4 : Vector Load (multiple 4-element structures) +class VLD4D<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b10,0b0000,op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD4, + !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"), + "", []>; +class VLD4WB<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b10,0b0001,op7_4, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$addr), IIC_VLD4, + !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"), + "$addr.addr = $wb", []>; + +def VLD4d8 : VLD4D<0b0000, "vld4.8">; +def VLD4d16 : VLD4D<0b0100, "vld4.16">; +def VLD4d32 : VLD4D<0b1000, "vld4.32">; +def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, + "vld1.64\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr", "", []>; + +// vld4 to double-spaced even registers. +def VLD4q8a : VLD4WB<0b0000, "vld4.8">; +def VLD4q16a : VLD4WB<0b0100, "vld4.16">; +def VLD4q32a : VLD4WB<0b1000, "vld4.32">; + +// vld4 to double-spaced odd registers. +def VLD4q8b : VLD4WB<0b0000, "vld4.8">; +def VLD4q16b : VLD4WB<0b0100, "vld4.16">; +def VLD4q32b : VLD4WB<0b1000, "vld4.32">; + +// VLD1LN : Vector Load (single element to one lane) +// FIXME: Not yet implemented. + +// VLD2LN : Vector Load (single 2-element structure to one lane) +class VLD2LN<bits<4> op11_8, string OpcodeStr> + : NLdSt<1,0b10,op11_8,0b0000, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VLD2, + !strconcat(OpcodeStr, "\t\\{$dst1[$lane],$dst2[$lane]\\}, $addr"), + "$src1 = $dst1, $src2 = $dst2", []>; + +def VLD2LNd8 : VLD2LN<0b0001, "vld2.8">; +def VLD2LNd16 : VLD2LN<0b0101, "vld2.16">; +def VLD2LNd32 : VLD2LN<0b1001, "vld2.32">; + +// vld2 to double-spaced even registers. +def VLD2LNq16a: VLD2LN<0b0101, "vld2.16">; +def VLD2LNq32a: VLD2LN<0b1001, "vld2.32">; + +// vld2 to double-spaced odd registers. +def VLD2LNq16b: VLD2LN<0b0101, "vld2.16">; +def VLD2LNq32b: VLD2LN<0b1001, "vld2.32">; + +// VLD3LN : Vector Load (single 3-element structure to one lane) +class VLD3LN<bits<4> op11_8, string OpcodeStr> + : NLdSt<1,0b10,op11_8,0b0000, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VLD3, + !strconcat(OpcodeStr, + "\t\\{$dst1[$lane],$dst2[$lane],$dst3[$lane]\\}, $addr"), + "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; + +def VLD3LNd8 : VLD3LN<0b0010, "vld3.8">; +def VLD3LNd16 : VLD3LN<0b0110, "vld3.16">; +def VLD3LNd32 : VLD3LN<0b1010, "vld3.32">; + +// vld3 to double-spaced even registers. +def VLD3LNq16a: VLD3LN<0b0110, "vld3.16">; +def VLD3LNq32a: VLD3LN<0b1010, "vld3.32">; + +// vld3 to double-spaced odd registers. 
+def VLD3LNq16b: VLD3LN<0b0110, "vld3.16">; +def VLD3LNq32b: VLD3LN<0b1010, "vld3.32">; + +// VLD4LN : Vector Load (single 4-element structure to one lane) +class VLD4LN<bits<4> op11_8, string OpcodeStr> + : NLdSt<1,0b10,op11_8,0b0000, + (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VLD4, + !strconcat(OpcodeStr, + "\t\\{$dst1[$lane],$dst2[$lane],$dst3[$lane],$dst4[$lane]\\}, $addr"), + "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; + +def VLD4LNd8 : VLD4LN<0b0011, "vld4.8">; +def VLD4LNd16 : VLD4LN<0b0111, "vld4.16">; +def VLD4LNd32 : VLD4LN<0b1011, "vld4.32">; + +// vld4 to double-spaced even registers. +def VLD4LNq16a: VLD4LN<0b0111, "vld4.16">; +def VLD4LNq32a: VLD4LN<0b1011, "vld4.32">; + +// vld4 to double-spaced odd registers. +def VLD4LNq16b: VLD4LN<0b0111, "vld4.16">; +def VLD4LNq32b: VLD4LN<0b1011, "vld4.32">; + +// VLD1DUP : Vector Load (single element to all lanes) +// VLD2DUP : Vector Load (single 2-element structure to all lanes) +// VLD3DUP : Vector Load (single 3-element structure to all lanes) +// VLD4DUP : Vector Load (single 4-element structure to all lanes) +// FIXME: Not yet implemented. +} // mayLoad = 1, hasExtraDefRegAllocReq = 1 + +// VST1 : Vector Store (multiple single elements) +class VST1D<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src\\}, $addr"), "", + [(IntOp addrmode6:$addr, (Ty DPR:$src))]>; +class VST1Q<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, + !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"), "", + [(IntOp addrmode6:$addr, (Ty QPR:$src))]>; + +let hasExtraSrcRegAllocReq = 1 in { +def VST1d8 : VST1D<0b0000, "vst1.8", v8i8, int_arm_neon_vst1>; +def VST1d16 : VST1D<0b0100, "vst1.16", v4i16, int_arm_neon_vst1>; +def VST1d32 : VST1D<0b1000, "vst1.32", v2i32, int_arm_neon_vst1>; +def VST1df : VST1D<0b1000, "vst1.32", v2f32, int_arm_neon_vst1>; +def VST1d64 : VST1D<0b1100, "vst1.64", v1i64, int_arm_neon_vst1>; + +def VST1q8 : VST1Q<0b0000, "vst1.8", v16i8, int_arm_neon_vst1>; +def VST1q16 : VST1Q<0b0100, "vst1.16", v8i16, int_arm_neon_vst1>; +def VST1q32 : VST1Q<0b1000, "vst1.32", v4i32, int_arm_neon_vst1>; +def VST1qf : VST1Q<0b1000, "vst1.32", v4f32, int_arm_neon_vst1>; +def VST1q64 : VST1Q<0b1100, "vst1.64", v2i64, int_arm_neon_vst1>; +} // hasExtraSrcRegAllocReq + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + +// VST2 : Vector Store (multiple 2-element structures) +class VST2D<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b00,0b1000,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1,$src2\\}, $addr"), "", []>; +class VST2Q<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b00,0b0011,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"), + "", []>; + +def VST2d8 : VST2D<0b0000, "vst2.8">; +def VST2d16 : VST2D<0b0100, "vst2.16">; +def VST2d32 : VST2D<0b1000, "vst2.32">; +def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + "vst1.64\t\\{$src1,$src2\\}, $addr", "", []>; + +def VST2q8 : VST2Q<0b0000, "vst2.8">; +def VST2q16 : VST2Q<0b0100, "vst2.16">; +def VST2q32 : VST2Q<0b1000, "vst2.32">; + 
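The structured loads and stores come in matched pairs: vld2 de-interleaves element streams from memory into two registers, vst2 re-interleaves them, and the 3- and 4-element flavors below follow the same scheme. A round-trip sketch for vld2.8/vst2.8 over one D register's worth of lanes:

#include <cstdint>
#include <cassert>

// memory: a0 b0 a1 b1 ...  ->  d1 = {a0,a1,...}, d2 = {b0,b1,...}
static void vld2_8(const uint8_t *mem, uint8_t d1[8], uint8_t d2[8]) {
  for (unsigned i = 0; i < 8; ++i) { d1[i] = mem[2*i]; d2[i] = mem[2*i+1]; }
}
static void vst2_8(uint8_t *mem, const uint8_t d1[8], const uint8_t d2[8]) {
  for (unsigned i = 0; i < 8; ++i) { mem[2*i] = d1[i]; mem[2*i+1] = d2[i]; }
}

int main() {
  uint8_t mem[16], d1[8], d2[8], out[16];
  for (unsigned i = 0; i < 16; ++i) mem[i] = (uint8_t)i;
  vld2_8(mem, d1, d2);
  assert(d1[3] == 6 && d2[3] == 7);              // element 3 of each stream
  vst2_8(out, d1, d2);
  for (unsigned i = 0; i < 16; ++i) assert(out[i] == mem[i]);  // round trip
  return 0;
}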
+// VST3 : Vector Store (multiple 3-element structures) +class VST3D<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b00,0b0100,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"), "", []>; +class VST3WB<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b00,0b0101,op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"), + "$addr.addr = $wb", []>; + +def VST3d8 : VST3D<0b0000, "vst3.8">; +def VST3d16 : VST3D<0b0100, "vst3.16">; +def VST3d32 : VST3D<0b1000, "vst3.32">; +def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), + IIC_VST, + "vst1.64\t\\{$src1,$src2,$src3\\}, $addr", "", []>; + +// vst3 to double-spaced even registers. +def VST3q8a : VST3WB<0b0000, "vst3.8">; +def VST3q16a : VST3WB<0b0100, "vst3.16">; +def VST3q32a : VST3WB<0b1000, "vst3.32">; + +// vst3 to double-spaced odd registers. +def VST3q8b : VST3WB<0b0000, "vst3.8">; +def VST3q16b : VST3WB<0b0100, "vst3.16">; +def VST3q32b : VST3WB<0b1000, "vst3.32">; + +// VST4 : Vector Store (multiple 4-element structures) +class VST4D<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b00,0b0000,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"), + "", []>; +class VST4WB<bits<4> op7_4, string OpcodeStr> + : NLdSt<0,0b00,0b0001,op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"), + "$addr.addr = $wb", []>; + +def VST4d8 : VST4D<0b0000, "vst4.8">; +def VST4d16 : VST4D<0b0100, "vst4.16">; +def VST4d32 : VST4D<0b1000, "vst4.32">; +def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + DPR:$src4), IIC_VST, + "vst1.64\t\\{$src1,$src2,$src3,$src4\\}, $addr", "", []>; + +// vst4 to double-spaced even registers. +def VST4q8a : VST4WB<0b0000, "vst4.8">; +def VST4q16a : VST4WB<0b0100, "vst4.16">; +def VST4q32a : VST4WB<0b1000, "vst4.32">; + +// vst4 to double-spaced odd registers. +def VST4q8b : VST4WB<0b0000, "vst4.8">; +def VST4q16b : VST4WB<0b0100, "vst4.16">; +def VST4q32b : VST4WB<0b1000, "vst4.32">; + +// VST1LN : Vector Store (single element from one lane) +// FIXME: Not yet implemented. + +// VST2LN : Vector Store (single 2-element structure from one lane) +class VST2LN<bits<4> op11_8, string OpcodeStr> + : NLdSt<1,0b00,op11_8,0b0000, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VST, + !strconcat(OpcodeStr, "\t\\{$src1[$lane],$src2[$lane]\\}, $addr"), + "", []>; + +def VST2LNd8 : VST2LN<0b0000, "vst2.8">; +def VST2LNd16 : VST2LN<0b0100, "vst2.16">; +def VST2LNd32 : VST2LN<0b1000, "vst2.32">; + +// vst2 to double-spaced even registers. +def VST2LNq16a: VST2LN<0b0100, "vst2.16">; +def VST2LNq32a: VST2LN<0b1000, "vst2.32">; + +// vst2 to double-spaced odd registers. 
+def VST2LNq16b: VST2LN<0b0100, "vst2.16">; +def VST2LNq32b: VST2LN<0b1000, "vst2.32">; + +// VST3LN : Vector Store (single 3-element structure from one lane) +class VST3LN<bits<4> op11_8, string OpcodeStr> + : NLdSt<1,0b00,op11_8,0b0000, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VST, + !strconcat(OpcodeStr, + "\t\\{$src1[$lane],$src2[$lane],$src3[$lane]\\}, $addr"), "", []>; + +def VST3LNd8 : VST3LN<0b0010, "vst3.8">; +def VST3LNd16 : VST3LN<0b0110, "vst3.16">; +def VST3LNd32 : VST3LN<0b1010, "vst3.32">; + +// vst3 to double-spaced even registers. +def VST3LNq16a: VST3LN<0b0110, "vst3.16">; +def VST3LNq32a: VST3LN<0b1010, "vst3.32">; + +// vst3 to double-spaced odd registers. +def VST3LNq16b: VST3LN<0b0110, "vst3.16">; +def VST3LNq32b: VST3LN<0b1010, "vst3.32">; + +// VST4LN : Vector Store (single 4-element structure from one lane) +class VST4LN<bits<4> op11_8, string OpcodeStr> + : NLdSt<1,0b00,op11_8,0b0000, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VST, + !strconcat(OpcodeStr, + "\t\\{$src1[$lane],$src2[$lane],$src3[$lane],$src4[$lane]\\}, $addr"), + "", []>; + +def VST4LNd8 : VST4LN<0b0011, "vst4.8">; +def VST4LNd16 : VST4LN<0b0111, "vst4.16">; +def VST4LNd32 : VST4LN<0b1011, "vst4.32">; + +// vst4 to double-spaced even registers. +def VST4LNq16a: VST4LN<0b0111, "vst4.16">; +def VST4LNq32a: VST4LN<0b1011, "vst4.32">; + +// vst4 to double-spaced odd registers. +def VST4LNq16b: VST4LN<0b0111, "vst4.16">; +def VST4LNq32b: VST4LN<0b1011, "vst4.32">; + +} // mayStore = 1, hasExtraSrcRegAllocReq = 1 //===----------------------------------------------------------------------===// @@ -117,18 +518,27 @@ def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr), // Extract D sub-registers of Q registers. // (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6) -def SubReg_i8_reg : SDNodeXForm<imm, [{ +def DSubReg_i8_reg : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32); }]>; -def SubReg_i16_reg : SDNodeXForm<imm, [{ +def DSubReg_i16_reg : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32); }]>; -def SubReg_i32_reg : SDNodeXForm<imm, [{ +def DSubReg_i32_reg : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32); }]>; -def SubReg_f64_reg : SDNodeXForm<imm, [{ +def DSubReg_f64_reg : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32); }]>; +def DSubReg_f64_other_reg : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(5 + (1 - N->getZExtValue()), MVT::i32); +}]>; + +// Extract S sub-registers of Q/D registers. +// (arm_ssubreg_0 is 1; arm_ssubreg_1 is 2; etc.) +def SSubReg_f32_reg : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(1 + N->getZExtValue(), MVT::i32); +}]>; // Translate lane numbers from Q registers to D subregs. 
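Concretely, the XForms above map a Q-register lane to a D or S subregister index using the fixed bases named in the comments (arm_dsubreg_0 is 5, arm_ssubreg_0 is 1), while the lane translations that follow supply the residual lane within that subregister. A sketch of the i16 case:

#include <cassert>

// Q-register lane -> (D subregister index, lane within that D register),
// mirroring DSubReg_i16_reg (5 + lane/4) and the lane translations below.
struct DLane { int subreg; int lane; };
static DLane q_lane_to_d_i16(int qlane) {
  return { 5 + qlane / 4, qlane % 4 };    // arm_dsubreg_0 == 5
}
static int s_subreg_f32(int lane) { return 1 + lane; }  // arm_ssubreg_0 == 1

int main() {
  DLane d = q_lane_to_d_i16(6);           // lane 6 of a v8i16 Q register
  assert(d.subreg == 6 && d.lane == 2);   // second D register, lane 2
  assert(s_subreg_f32(3) == 4);
  return 0;
}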
def SubReg_i8_lane : SDNodeXForm<imm, [{ @@ -150,117 +560,337 @@ class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins DPR:$src), IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins QPR:$src), IIC_VUNAQ, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; +// Basic 2-register operations, scalar single-precision. +class N2VDs<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), + IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src"), "", []>; + +class N2VDsPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst> + : NEONFPPat<(ResTy (OpNode SPR:$a)), + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)), + arm_ssubreg_0)>; + // Basic 2-register intrinsics, both double- and quad-register. class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; +// Basic 2-register intrinsics, scalar single-precision +class N2VDInts<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), itin, + !strconcat(OpcodeStr, "\t$dst, $src"), "", []>; + +class N2VDIntsPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a)), + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)), + arm_ssubreg_0)>; + // Narrow 2-register intrinsics. 
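The N2VDsPat/N3VDsPat patterns above are how scalar single-precision ops ride on NEON: INSERT_SUBREG puts the f32 into lane 0 of an IMPLICIT_DEF v2f32, the D-register instruction runs, and EXTRACT_SUBREG pulls lane 0 back out. A behavioral analog, modeling the undef lane as zero and using a stand-in vneg for whichever D-register op the pattern wraps:

#include <cassert>

struct V2f32 { float e[2]; };

// Stand-in for a D-register NEON op; the real patterns wrap instructions
// defined elsewhere in this file.
static V2f32 vneg(V2f32 v) { return { { -v.e[0], -v.e[1] } }; }

static float scalar_via_vector(float a) {
  V2f32 t = { { a, 0.0f } };  // INSERT_SUBREG into an IMPLICIT_DEF vector
  t = vneg(t);                // the D-register NEON instruction
  return t.e[0];              // EXTRACT_SUBREG arm_ssubreg_0
}

int main() {
  assert(scalar_via_vector(2.5f) == -2.5f);
  return 0;
}

The narrowing classes resume below: N2VNInt takes a Q-register source to a D-register result, as a vmovn-style narrow does.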
class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, - string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp> + InstrItinClass itin, string OpcodeStr, + ValueType TyD, ValueType TyQ, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst), - (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>; // Long 2-register intrinsics. (This is currently only used for VMOVL and is // derived from N2VImm instead of N2V because of the way the size is encoded.) class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, - bit op6, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD, - Intrinsic IntOp> + bit op6, bit op4, InstrItinClass itin, string OpcodeStr, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst), - (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>; +// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register. +class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2), + (ins DPR:$src1, DPR:$src2), IIC_VPERMD, + !strconcat(OpcodeStr, "\t$dst1, $dst2"), + "$src1 = $dst1, $src2 = $dst2", []>; +class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, + InstrItinClass itin, string OpcodeStr> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2), + (ins QPR:$src1, QPR:$src2), itin, + !strconcat(OpcodeStr, "\t$dst1, $dst2"), + "$src1 = $dst1, $src2 = $dst2", []>; + // Basic 3-register operations, both double- and quad-register. 
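// The ResTy/OpTy/OpNode parameters let one class cover both the integer and
// floating-point forms; for example, VADDfd further down instantiates N3VD
// with ResTy = OpTy = v2f32 and OpNode = fadd, which yields the pattern
// (set DPR:$dst, (v2f32 (fadd (v2f32 DPR:$src1), (v2f32 DPR:$src2)))).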
class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType ResTy, ValueType OpTy, + InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; } +class N3VDSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDSL16<bits<2> op21_20, bits<4> op11_8, + string OpcodeStr, ValueType Ty, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + IIC_VMULi16D, + !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_8:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} + class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType ResTy, ValueType OpTy, + InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; } +class N3VQSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQSL16<bits<2> op21_20, bits<4> op11_8, + string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), + IIC_VMULi16Q, + !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_8:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} + +// Basic 3-register operations, scalar single-precision +class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", []> { + let isCommutable = Commutable; +} +class N3VDsPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)), + arm_ssubreg_0)>; // Basic 3-register intrinsics, both double- 
and quad-register. class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType ResTy, ValueType OpTy, + InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; } +class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (Ty DPR:$dst), + (Ty (IntOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (Ty DPR:$dst), + (Ty (IntOp (Ty DPR:$src1), + (Ty (NEONvduplane (Ty DPR_8:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} + class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType ResTy, ValueType OpTy, + InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; } +class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (ResTy (NEONvduplane (OpTy DPR_8:$src2), + imm:$lane)))))]> { + let isCommutable = 0; +} // Multiply-Add/Sub operations, both double- and quad-register. 
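// The "$src1 = $dst" constraint ties the accumulator to the destination, so
// these instructions read and write the same register. As a sketch, VMLAfd
// further down passes MulOp = fmul and OpNode = fadd, expanding to
// dst = src1 + (src2 * src3) on v2f32.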
class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode> + InstrItinClass itin, string OpcodeStr, + ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", [(set DPR:$dst, (Ty (OpNode DPR:$src1, (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; +class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$src2, + (Ty (NEONvduplane (Ty DPR_VFP2:$src3), + imm:$lane)))))))]>; +class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp> + : N3V<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst", + [(set (Ty DPR:$dst), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$src2, + (Ty (NEONvduplane (Ty DPR_8:$src3), + imm:$lane)))))))]>; + class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode> + InstrItinClass itin, string OpcodeStr, ValueType Ty, + SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", [(set QPR:$dst, (Ty (OpNode QPR:$src1, (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; +class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + SDNode MulOp, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$src2, + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3), + imm:$lane)))))))]>; +class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + SDNode MulOp, SDNode ShOp> + : N3V<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$src2, + (ResTy (NEONvduplane (OpTy DPR_8:$src3), + imm:$lane)))))))]>; + +// Multiply-Add/Sub operations, scalar single-precision +class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", []>; + +class N3VDMulOpsPat<SDNode MulNode, SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 
(MulNode SPR:$a, SPR:$b)))), + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$acc, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)), + arm_ssubreg_0)>; // Neon 3-argument intrinsics, both double- and quad-register. // The destination register is also used as the first source operand register. class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; @@ -268,19 +898,44 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Neon Long 3-argument intrinsic. The destination register is // a quad-register and is also used as the first source operand register. class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp> + InstrItinClass itin, string OpcodeStr, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin, !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; +class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$src2), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3), + imm:$lane)))))]>; +class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), + (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$src2), + (OpTy (NEONvduplane (OpTy DPR_8:$src3), + imm:$lane)))))]>; + // Narrowing 3-register intrinsics. 
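// Both sources are Q registers and the D-register result keeps only the
// narrowed form of each element; e.g. the VADDHN definitions below use this
// class (via N3VNInt_HSD) to model "vaddhn.i16" adding two v8i16 vectors and
// returning the high half of each 16-bit sum as a v8i8 result.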
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), + (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { let isCommutable = Commutable; @@ -288,21 +943,40 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Long 3-register intrinsics. class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, ValueType TyQ, ValueType TyD, + InstrItinClass itin, string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; } +class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (OpTy DPR:$src1), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), + imm:$lane)))))]>; +class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "", + [(set (ResTy QPR:$dst), + (ResTy (IntOp (OpTy DPR:$src1), + (OpTy (NEONvduplane (OpTy DPR_8:$src2), + imm:$lane)))))]>; // Wide 3-register intrinsics. 
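// Wide operations combine a Q-register operand with a D-register operand
// whose elements are first widened, giving Q = Q op D; the VADDW and VSUBW
// definitions below ("Vector Add/Subtract Wide") are built from this class
// via N3VWInt_QHS.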
class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD, !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -313,13 +987,13 @@ class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins DPR:$src), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + (ins QPR:$src), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; // Pairwise long 2-register accumulate intrinsics, @@ -329,29 +1003,31 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD, !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst", [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>; class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ, !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst", [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>; // Shift by immediate, // both double- and quad-register. 
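// The shift amount arrives as the i32imm operand $SIMM rather than in a
// register. An illustrative (hypothetical) instantiation might look like:
//   def VSHRsv8i8_sketch : N2VDSh<0, 1, 0b001000, 0b0000, 0, 1, IIC_VSHLiD,
//                                 "vshr.s8", v8i8, NEONvshrs_sketch>;
// where both the def name and the NEONvshrs_sketch node are placeholders,
// not definitions from this patch.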
class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, - bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode> + bit op4, InstrItinClass itin, string OpcodeStr, + ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin, !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, - bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode> + bit op4, InstrItinClass itin, string OpcodeStr, + ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; @@ -360,17 +1036,17 @@ class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, - (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), + (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), (i32 imm:$SIMM))))]>; // Narrow shift by immediate. class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, - bit op6, bit op4, string OpcodeStr, ValueType ResTy, - ValueType OpTy, SDNode OpNode> + bit op6, bit op4, InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, - (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), + (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), (i32 imm:$SIMM))))]>; @@ -381,6 +1057,7 @@ class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), + IIC_VPALiD, !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", [(set DPR:$dst, (Ty (add DPR:$src1, (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; @@ -388,6 +1065,7 @@ class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), + IIC_VPALiD, !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", [(set QPR:$dst, (Ty (add QPR:$src1, (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; @@ -398,12 +1076,14 @@ class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), + IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), + IIC_VSHLiQ, !strconcat(OpcodeStr, "\t$dst, 
$src2, $SIMM"), "$src1 = $dst", [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; @@ -413,14 +1093,14 @@ class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ, !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; @@ -428,50 +1108,68 @@ class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, // Multiclasses //===----------------------------------------------------------------------===// +// Abbreviations used in multiclass suffixes: +// Q = quarter int (8 bit) elements +// H = half int (16 bit) elements +// S = single int (32 bit) elements +// D = double int (64 bit) elements + // Neon 3-register vector operations. // First with only element sizes of 8, 16 and 32 bits: multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, SDNode OpNode, bit Commutable = 0> { // 64-bit vector types. - def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), - v8i8, v8i8, OpNode, Commutable>; - def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"), - v4i16, v4i16, OpNode, Commutable>; - def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"), - v2i32, v2i32, OpNode, Commutable>; + def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, + !strconcat(OpcodeStr, "8"), v8i8, v8i8, OpNode, Commutable>; + def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16, + !strconcat(OpcodeStr, "16"), v4i16, v4i16, OpNode, Commutable>; + def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32, + !strconcat(OpcodeStr, "32"), v2i32, v2i32, OpNode, Commutable>; // 128-bit vector types. 
- def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), - v16i8, v16i8, OpNode, Commutable>; - def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"), - v8i16, v8i16, OpNode, Commutable>; - def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"), - v4i32, v4i32, OpNode, Commutable>; + def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16, + !strconcat(OpcodeStr, "8"), v16i8, v16i8, OpNode, Commutable>; + def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16, + !strconcat(OpcodeStr, "16"), v8i16, v8i16, OpNode, Commutable>; + def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32, + !strconcat(OpcodeStr, "32"), v4i32, v4i32, OpNode, Commutable>; +} + +multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> { + def v4i16 : N3VDSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, ShOp>; + def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, !strconcat(OpcodeStr, "32"), v2i32, ShOp>; + def v8i16 : N3VQSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v8i16, v4i16, ShOp>; + def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, !strconcat(OpcodeStr, "32"), v4i32, v2i32, ShOp>; } // ....then also with element size 64 bits: multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, string OpcodeStr, SDNode OpNode, bit Commutable = 0> - : N3V_QHS<op24, op23, op11_8, op4, OpcodeStr, OpNode, Commutable> { - def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"), - v1i64, v1i64, OpNode, Commutable>; - def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"), - v2i64, v2i64, OpNode, Commutable>; + : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ, + OpcodeStr, OpNode, Commutable> { + def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD, + !strconcat(OpcodeStr, "64"), v1i64, v1i64, OpNode, Commutable>; + def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ, + !strconcat(OpcodeStr, "64"), v2i64, v2i64, OpNode, Commutable>; } // Neon Narrowing 2-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, string OpcodeStr, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, Intrinsic IntOp> { def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, - !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>; + itin, !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>; def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, - !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>; + itin, !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>; def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, - !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>; + itin, !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>; } @@ -480,11 +1178,11 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, Intrinsic IntOp> { def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4, - !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>; + IIC_VQUNAiD, !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>; def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4, - !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; + IIC_VQUNAiD, !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4, - 
!strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; + IIC_VQUNAiD, !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; } @@ -492,38 +1190,56 @@ multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, // First with only element sizes of 16 and 32 bits: multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> { // 64-bit vector types. - def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16, !strconcat(OpcodeStr,"16"), v4i16, v4i16, IntOp, Commutable>; - def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32, !strconcat(OpcodeStr,"32"), v2i32, v2i32, IntOp, Commutable>; // 128-bit vector types. - def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16, !strconcat(OpcodeStr,"16"), v8i16, v8i16, IntOp, Commutable>; - def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32, !strconcat(OpcodeStr,"32"), v4i32, v4i32, IntOp, Commutable>; } +multiclass N3VIntSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, Intrinsic IntOp> { + def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16, !strconcat(OpcodeStr, "16"), v4i16, IntOp>; + def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, !strconcat(OpcodeStr, "32"), v2i32, IntOp>; + def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16, !strconcat(OpcodeStr, "16"), v8i16, v4i16, IntOp>; + def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32, !strconcat(OpcodeStr, "32"), v4i32, v2i32, IntOp>; +} + // ....then also with element size of 8 bits: multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> { - def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), - v8i8, v8i8, IntOp, Commutable>; - def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), - v16i8, v16i8, IntOp, Commutable>; + : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, IntOp, Commutable> { + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16, + !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp, Commutable>; + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16, + !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp, Commutable>; } // ....then also with element size of 64 bits: multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_QHS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> { - def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"), - v1i64, v1i64, IntOp, Commutable>; - def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"), - v2i64, v2i64, IntOp, Commutable>; + : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, IntOp, Commutable> { + def 
v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32, + !strconcat(OpcodeStr,"64"), v1i64, v1i64, IntOp, Commutable>; + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32, + !strconcat(OpcodeStr,"64"), v2i64, v2i64, IntOp, Commutable>; } @@ -544,19 +1260,29 @@ multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, // First with only element sizes of 16 and 32 bits: multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> { - def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), - v4i32, v4i16, IntOp, Commutable>; - def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), - v2i64, v2i32, IntOp, Commutable>; + InstrItinClass itin, string OpcodeStr, + Intrinsic IntOp, bit Commutable = 0> { + def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin, + !strconcat(OpcodeStr,"16"), v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin, + !strconcat(OpcodeStr,"32"), v2i64, v2i32, IntOp, Commutable>; +} + +multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, Intrinsic IntOp> { + def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, + !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, + !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; } // ....then also with element size of 8 bits: multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> - : N3VLInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> { - def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), - v8i16, v8i8, IntOp, Commutable>; + InstrItinClass itin, string OpcodeStr, + Intrinsic IntOp, bit Commutable = 0> + : N3VLInt_HS<op24, op23, op11_8, op4, itin, OpcodeStr, IntOp, Commutable> { + def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin, + !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp, Commutable>; } @@ -576,43 +1302,58 @@ multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // Neon Multiply-Op vector operations, // element sizes of 8, 16 and 32 bits: multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, SDNode OpNode> { // 64-bit vector types. - def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, + def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16, !strconcat(OpcodeStr, "8"), v8i8, mul, OpNode>; - def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, + def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16, !strconcat(OpcodeStr, "16"), v4i16, mul, OpNode>; - def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, + def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32, !strconcat(OpcodeStr, "32"), v2i32, mul, OpNode>; // 128-bit vector types. 
- def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, + def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16, !strconcat(OpcodeStr, "8"), v16i8, mul, OpNode>; - def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, + def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16, !strconcat(OpcodeStr, "16"), v8i16, mul, OpNode>; - def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, + def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32, !strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>; } +multiclass N3VMulOpSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, SDNode ShOp> { + def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16, + !strconcat(OpcodeStr, "16"), v4i16, mul, ShOp>; + def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, + !strconcat(OpcodeStr, "32"), v2i32, mul, ShOp>; + def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16, + !strconcat(OpcodeStr, "16"), v8i16, v4i16, mul, ShOp>; + def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32, + !strconcat(OpcodeStr, "32"), v4i32, v2i32, mul, ShOp>; +} // Neon 3-argument intrinsics, // element sizes of 8, 16 and 32 bits: multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, Intrinsic IntOp> { // 64-bit vector types. - def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D, !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>; - def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D, !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>; - def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D, !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>; // 128-bit vector types. 
- def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, + def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q, !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>; - def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q, !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>; - def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q, !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>; } @@ -622,17 +1363,25 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // First with only element sizes of 16 and 32 bits: multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, Intrinsic IntOp> { - def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, + def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D, !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; - def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, + def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi16D, !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; } +multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, + string OpcodeStr, Intrinsic IntOp> { + def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D, + !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D, + !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; +} + // ....then also with element size of 8 bits: multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, Intrinsic IntOp> : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> { - def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, + def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D, !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>; } @@ -640,23 +1389,24 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // Neon 2-register vector intrinsics, // element sizes of 8, 16 and 32 bits: multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, - bits<5> op11_7, bit op4, string OpcodeStr, - Intrinsic IntOp> { + bits<5> op11_7, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, Intrinsic IntOp> { // 64-bit vector types. def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, - !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>; + itinD, !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>; def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, - !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>; + itinD, !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>; def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, - !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>; + itinD, !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>; // 128-bit vector types. 
def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, - !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>; + itinQ, !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>; def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, - !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>; + itinQ, !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>; def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, - !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>; + itinQ, !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>; } @@ -709,25 +1459,25 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, // element sizes of 8, 16, 32 and 64 bits: multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, SDNode OpNode> { + InstrItinClass itin, string OpcodeStr, SDNode OpNode> { // 64-bit vector types. - def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, + def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, itin, !strconcat(OpcodeStr, "8"), v8i8, OpNode>; - def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, + def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, itin, !strconcat(OpcodeStr, "16"), v4i16, OpNode>; - def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, + def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, itin, !strconcat(OpcodeStr, "32"), v2i32, OpNode>; - def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, + def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, itin, !strconcat(OpcodeStr, "64"), v1i64, OpNode>; // 128-bit vector types. - def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, + def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, itin, !strconcat(OpcodeStr, "8"), v16i8, OpNode>; - def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, + def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, itin, !strconcat(OpcodeStr, "16"), v8i16, OpNode>; - def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, + def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, itin, !strconcat(OpcodeStr, "32"), v4i32, OpNode>; - def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, + def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, itin, !strconcat(OpcodeStr, "64"), v2i64, OpNode>; } @@ -790,24 +1540,30 @@ multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, // Vector Add Operations. 
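// Besides plain VADD, this group covers the halving and saturating variants
// defined below: vhadd in effect computes (a + b) >> 1 per element so the
// sum cannot overflow, and vqadd clamps each sum to the limits of the
// element type instead of wrapping.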
// VADD : Vector Add (integer and floating-point) -defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>; -def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd, 1>; -def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, "vadd.f32", v4f32, v4f32, fadd, 1>; +defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd.i", add, 1>; +def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd.f32", v2f32, v2f32, fadd, 1>; +def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd.f32", v4f32, v4f32, fadd, 1>; // VADDL : Vector Add Long (Q = D + D) -defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, "vaddl.s", int_arm_neon_vaddls, 1>; -defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, "vaddl.u", int_arm_neon_vaddlu, 1>; +defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, "vaddl.s", int_arm_neon_vaddls, 1>; +defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl.u", int_arm_neon_vaddlu, 1>; // VADDW : Vector Add Wide (Q = Q + D) defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw.s", int_arm_neon_vaddws, 0>; defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw.u", int_arm_neon_vaddwu, 0>; // VHADD : Vector Halving Add -defm VHADDs : N3VInt_QHS<0,0,0b0000,0, "vhadd.s", int_arm_neon_vhadds, 1>; -defm VHADDu : N3VInt_QHS<1,0,0b0000,0, "vhadd.u", int_arm_neon_vhaddu, 1>; +defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vhadd.s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vhadd.u", int_arm_neon_vhaddu, 1>; // VRHADD : Vector Rounding Halving Add -defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, "vrhadd.s", int_arm_neon_vrhadds, 1>; -defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, "vrhadd.u", int_arm_neon_vrhaddu, 1>; +defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vrhadd.s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vrhadd.u", int_arm_neon_vrhaddu, 1>; // VQADD : Vector Saturating Add -defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, "vqadd.s", int_arm_neon_vqadds, 1>; -defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, "vqadd.u", int_arm_neon_vqaddu, 1>; +defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vqadd.s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vqadd.u", int_arm_neon_vqaddu, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>; // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) @@ -816,64 +1572,208 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>; // Vector Multiply Operations. 
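// Note on naming: a defm concatenates its name with each member suffix, so
// "defm VMULsl : N3VSL_HS<...>" below produces defs such as VMULslv8i16 and
// VMULslv4i32, which is how the standalone lane patterns that follow can
// refer to individual instructions.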
// VMUL : Vector Multiply (integer, polynomial and floating-point) -defm VMUL : N3V_QHS<0, 0, 0b1001, 1, "vmul.i", mul, 1>; -def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v8i8, v8i8, +defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, + IIC_VMULi32Q, "vmul.i", mul, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul.p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; -def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8, +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul.p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; -def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>; -def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>; +def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul.f32", v2f32, v2f32, fmul, 1>; +def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul.f32", v4f32, v4f32, fmul, 1>; +defm VMULsl : N3VSL_HS<0b1000, "vmul.i", mul>; +def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul.f32", v2f32, fmul>; +def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul.f32", v4f32, v2f32, fmul>; +def : Pat<(v8i16 (mul (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (mul (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), + (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), + (v4f32 (VMULslfq (v4f32 QPR:$src1), + (v2f32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + // VQDMULH : Vector Saturating Doubling Multiply Returning High Half -defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>; +defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh.s", int_arm_neon_vqdmulh, 1>; +defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh.s", int_arm_neon_vqdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half -defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>; +defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>; +defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqrdmulh.s", int_arm_neon_vqrdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 
(int_arm_neon_vqrdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) -defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>; -defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>; -def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8, +defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, "vmull.s", int_arm_neon_vmulls, 1>; +defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, "vmull.u", int_arm_neon_vmullu, 1>; +def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull.p8", v8i16, v8i8, int_arm_neon_vmullp, 1>; +defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull.s", int_arm_neon_vmulls>; +defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull.u", int_arm_neon_vmullu>; + // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) -defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>; +defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, "vqdmull.s", int_arm_neon_vqdmull, 1>; +defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, "vqdmull.s", int_arm_neon_vqdmull>; // Vector Multiply-Accumulate and Multiply-Subtract Operations. // VMLA : Vector Multiply Accumulate (integer and floating-point) -defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>; -def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>; -def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>; +defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla.i", add>; +def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32, fmul, fadd>; +def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla.f32", v4f32, fmul, fadd>; +defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla.i", add>; +def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla.f32", v2f32, fmul, fadd>; +def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla.f32", v4f32, v2f32, fmul, fadd>; + +def : Pat<(v8i16 (add (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (add (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), + (fmul (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLAslfq (v4f32 QPR:$src1), + (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + // VMLAL : Vector Multiply Accumulate Long (Q += D * D) defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>; defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>; + +defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal.s", int_arm_neon_vmlals>; +defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal.u", int_arm_neon_vmlalu>; + // VQDMLAL : Vector Saturating Doubling 
Multiply Accumulate Long (Q += D * D) defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>; +defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal.s", int_arm_neon_vqdmlal>; + // VMLS : Vector Multiply Subtract (integer and floating-point) -defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>; -def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>; -def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>; +defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls.i", sub>; +def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32, fmul, fsub>; +def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls.f32", v4f32, fmul, fsub>; +defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls.i", sub>; +def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls.f32", v2f32, fmul, fsub>; +def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls.f32", v4f32, v2f32, fmul, fsub>; + +def : Pat<(v8i16 (sub (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (sub (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), + (fmul (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLSslfq (v4f32 QPR:$src1), + (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + // VMLSL : Vector Multiply Subtract Long (Q -= D * D) defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>; defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>; + +defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl.s", int_arm_neon_vmlsls>; +defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl.u", int_arm_neon_vmlslu>; + // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>; +defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl.s", int_arm_neon_vqdmlsl>; // Vector Subtract Operations. 
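// This mirrors the add section above: vhsub halves each difference, vqsub
// saturates it, and vsubhn/vrsubhn narrow a Q - Q result by returning only
// the high half of each element's difference (e.g. the top 16 bits of each
// 32-bit difference for "vsubhn.i32").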
// VSUB : Vector Subtract (integer and floating-point) -defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, "vsub.i", sub, 0>; -def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub, 0>; -def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, "vsub.f32", v4f32, v4f32, fsub, 0>; +defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ, "vsub.i", sub, 0>; +def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub.f32", v2f32, v2f32, fsub, 0>; +def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub.f32", v4f32, v4f32, fsub, 0>; // VSUBL : Vector Subtract Long (Q = D - D) -defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, "vsubl.s", int_arm_neon_vsubls, 1>; -defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, "vsubl.u", int_arm_neon_vsublu, 1>; +defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, "vsubl.s", int_arm_neon_vsubls, 1>; +defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl.u", int_arm_neon_vsublu, 1>; // VSUBW : Vector Subtract Wide (Q = Q - D) defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw.s", int_arm_neon_vsubws, 0>; defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw.u", int_arm_neon_vsubwu, 0>; // VHSUB : Vector Halving Subtract -defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, "vhsub.s", int_arm_neon_vhsubs, 0>; -defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, "vhsub.u", int_arm_neon_vhsubu, 0>; +defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vhsub.s", int_arm_neon_vhsubs, 0>; +defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vhsub.u", int_arm_neon_vhsubu, 0>; // VQSUB : Vector Saturating Subtract -defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, "vqsub.s", int_arm_neon_vqsubs, 0>; -defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, "vqsub.u", int_arm_neon_vqsubu, 0>; +defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vqsub.s", int_arm_neon_vqsubs, 0>; +defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vqsub.u", int_arm_neon_vqsubu, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>; // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>; @@ -882,85 +1782,101 @@ defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>; // Vector Comparisons.
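// NEON comparisons produce a per-element mask of all ones or all zeros, so
// even the floating-point forms yield integer result types; VCEQfd below is
// typed with a v2i32 result and v2f32 operands for exactly that reason.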
// VCEQ : Vector Compare Equal
-defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, "vceq.i", NEONvceq, 1>;
-def VCEQfd : N3VD<0,0,0b00,0b1110,0, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
-def VCEQfq : N3VQ<0,0,0b00,0b1110,0, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
+defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+            IIC_VBINi4Q, "vceq.i", NEONvceq, 1>;
+def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
+def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
// VCGE : Vector Compare Greater Than or Equal
-defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, "vcge.s", NEONvcge, 0>;
-defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, "vcge.u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
-def VCGEfq : N3VQ<1,0,0b00,0b1110,0, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
+defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vcge.s", NEONvcge, 0>;
+defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vcge.u", NEONvcgeu, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
+def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
// VCGT : Vector Compare Greater Than
-defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, "vcgt.s", NEONvcgt, 0>;
-defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, "vcgt.u", NEONvcgtu, 0>;
-def VCGTfd : N3VD<1,0,0b10,0b1110,0, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
-def VCGTfq : N3VQ<1,0,0b10,0b1110,0, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
+defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vcgt.s", NEONvcgt, 0>;
+defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vcgt.u", NEONvcgtu, 0>;
+def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
+def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
-def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v2i32, v2f32,
+def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge.f32", v2i32, v2f32,
             int_arm_neon_vacged, 0>;
-def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v4i32, v4f32,
+def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge.f32", v4i32, v4f32,
             int_arm_neon_vacgeq, 0>;
// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
-def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v2i32, v2f32,
+def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt.f32", v2i32, v2f32,
             int_arm_neon_vacgtd, 0>;
-def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v4i32, v4f32,
+def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt.f32", v4i32, v4f32,
             int_arm_neon_vacgtq, 0>;
// VTST : Vector Test Bits
-defm VTST : N3V_QHS<0, 0, 0b1000, 1, "vtst.i", NEONvtst, 1>;
+defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+            IIC_VBINi4Q, "vtst.i", NEONvtst, 1>;

// Vector Bitwise Operations.
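The VBSL definition in this section selects, bit by bit, between its second and third operands under the mask in the first: (src2 & src1) | (src3 & ~src1), exactly the (or (and ...), (and ... (vnot_conv ...))) pattern below. A hedged C sketch of the usual compare-then-select idiom, again assuming <arm_neon.h>:

#include <arm_neon.h>

/* Branchless per-lane select: where mask bits are set take t, else f.
   vbslq_f32 maps to the VBSL instruction modeled by VBSLd/VBSLq below. */
float32x4_t select_lanes(uint32x4_t mask, float32x4_t t, float32x4_t f) {
    return vbslq_f32(mask, t, f);
}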
// VAND : Vector Bitwise AND
-def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, "vand", v2i32, v2i32, and, 1>;
-def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, "vand", v4i32, v4i32, and, 1>;
+def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>;
+def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand", v4i32, v4i32, and, 1>;
// VEOR : Vector Bitwise Exclusive OR
-def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, "veor", v2i32, v2i32, xor, 1>;
-def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, "veor", v4i32, v4i32, xor, 1>;
+def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor", v2i32, v2i32, xor, 1>;
+def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor", v4i32, v4i32, xor, 1>;
// VORR : Vector Bitwise OR
-def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, "vorr", v2i32, v2i32, or, 1>;
-def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, "vorr", v4i32, v4i32, or, 1>;
+def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", v2i32, v2i32, or, 1>;
+def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", v4i32, v4i32, or, 1>;
// VBIC : Vector Bitwise Bit Clear (AND NOT)
def VBICd : N3V<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
-               (ins DPR:$src1, DPR:$src2), "vbic\t$dst, $src1, $src2", "",
-               [(set DPR:$dst, (v2i32 (and DPR:$src1,(vnot DPR:$src2))))]>;
+               (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+               "vbic\t$dst, $src1, $src2", "",
+               [(set DPR:$dst, (v2i32 (and DPR:$src1,
+                                           (vnot_conv DPR:$src2))))]>;
def VBICq : N3V<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
-               (ins QPR:$src1, QPR:$src2), "vbic\t$dst, $src1, $src2", "",
-               [(set QPR:$dst, (v4i32 (and QPR:$src1,(vnot QPR:$src2))))]>;
+               (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+               "vbic\t$dst, $src1, $src2", "",
+               [(set QPR:$dst, (v4i32 (and QPR:$src1,
+                                           (vnot_conv QPR:$src2))))]>;
// VORN : Vector Bitwise OR NOT
def VORNd : N3V<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
-               (ins DPR:$src1, DPR:$src2), "vorn\t$dst, $src1, $src2", "",
-               [(set DPR:$dst, (v2i32 (or DPR:$src1, (vnot DPR:$src2))))]>;
+               (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+               "vorn\t$dst, $src1, $src2", "",
+               [(set DPR:$dst, (v2i32 (or DPR:$src1,
+                                          (vnot_conv DPR:$src2))))]>;
def VORNq : N3V<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
-               (ins QPR:$src1, QPR:$src2), "vorn\t$dst, $src1, $src2", "",
-               [(set QPR:$dst, (v4i32 (or QPR:$src1, (vnot QPR:$src2))))]>;
+               (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+               "vorn\t$dst, $src1, $src2", "",
+               [(set QPR:$dst, (v4i32 (or QPR:$src1,
+                                          (vnot_conv QPR:$src2))))]>;
// VMVN : Vector Bitwise NOT
def VMVNd : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
-               (outs DPR:$dst), (ins DPR:$src), "vmvn\t$dst, $src", "",
+               (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD,
+               "vmvn\t$dst, $src", "",
                [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>;
def VMVNq : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
-               (outs QPR:$dst), (ins QPR:$src), "vmvn\t$dst, $src", "",
+               (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD,
+               "vmvn\t$dst, $src", "",
                [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>;
def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>;
def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>;
// VBSL : Vector Bitwise Select
def VBSLd : N3V<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
-               (ins DPR:$src1, DPR:$src2, DPR:$src3),
+               (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD,
                "vbsl\t$dst, $src2, $src3", "$src1 = $dst",
                [(set DPR:$dst,
                  (v2i32 (or (and DPR:$src2, DPR:$src1),
-                            (and DPR:$src3, (vnot DPR:$src1)))))]>;
+                            (and DPR:$src3, (vnot_conv DPR:$src1)))))]>;
def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
-               (ins QPR:$src1, QPR:$src2, QPR:$src3),
+               (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ,
                "vbsl\t$dst, $src2, $src3", "$src1 = $dst",
                [(set QPR:$dst,
                  (v4i32 (or (and QPR:$src2, QPR:$src1),
-                            (and QPR:$src3, (vnot QPR:$src1)))))]>;
+                            (and QPR:$src3, (vnot_conv QPR:$src1)))))]>;

// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif\t$dst, $src3, $src1", "$src2 = $dst",
@@ -973,16 +1889,18 @@ def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),

// Vector Absolute Differences.

// VABD : Vector Absolute Difference
-defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
-defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
-def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
-             int_arm_neon_vabdf, 0>;
-def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
-             int_arm_neon_vabdf, 0>;
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vabd.s", int_arm_neon_vabds, 0>;
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vabd.u", int_arm_neon_vabdu, 0>;
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND, "vabd.f32", v2f32, v2f32,
+             int_arm_neon_vabds, 0>;
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vabd.f32", v4f32, v4f32,
+             int_arm_neon_vabds, 0>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
-defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
-defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, "vabdl.u", int_arm_neon_vabdlu, 0>;
+defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VBINi4Q, "vabdl.s", int_arm_neon_vabdls, 0>;
+defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VBINi4Q, "vabdl.u", int_arm_neon_vabdlu, 0>;

// VABA : Vector Absolute Difference and Accumulate
defm VABAs : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
@@ -995,32 +1913,36 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>;

// Vector Maximum and Minimum.
// VMAX : Vector Maximum
-defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
-defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
-def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
-             int_arm_neon_vmaxf, 1>;
-def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
-             int_arm_neon_vmaxf, 1>;
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vmax.s", int_arm_neon_vmaxs, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vmax.u", int_arm_neon_vmaxu, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax.f32", v2f32, v2f32,
+             int_arm_neon_vmaxs, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax.f32", v4f32, v4f32,
+             int_arm_neon_vmaxs, 1>;
// VMIN : Vector Minimum
-defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
-defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
-def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
-             int_arm_neon_vminf, 1>;
-def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
-             int_arm_neon_vminf, 1>;
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vmin.s", int_arm_neon_vmins, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+             IIC_VBINi4Q, "vmin.u", int_arm_neon_vminu, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin.f32", v2f32, v2f32,
+             int_arm_neon_vmins, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin.f32", v4f32, v4f32,
+             int_arm_neon_vmins, 1>;

// Vector Pairwise Operations.

// VPADD : Vector Pairwise Add
-def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, "vpadd.i8", v8i8, v8i8,
-              int_arm_neon_vpaddi, 0>;
-def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, "vpadd.i16", v4i16, v4i16,
-               int_arm_neon_vpaddi, 0>;
-def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, "vpadd.i32", v2i32, v2i32,
-               int_arm_neon_vpaddi, 0>;
-def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, "vpadd.f32", v2f32, v2f32,
-             int_arm_neon_vpaddf, 0>;
+def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd.i8", v8i8, v8i8,
+              int_arm_neon_vpadd, 0>;
+def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd.i16", v4i16, v4i16,
+               int_arm_neon_vpadd, 0>;
+def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd.i32", v2i32, v2i32,
+               int_arm_neon_vpadd, 0>;
+def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd.f32", v2f32, v2f32,
+             int_arm_neon_vpadd, 0>;

// VPADDL : Vector Pairwise Add Long
defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl.s",
@@ -1035,81 +1957,91 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpadal.u",
               int_arm_neon_vpadalu>;

// VPMAX : Vector Pairwise Maximum
-def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, "vpmax.s8", v8i8, v8i8,
+def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax.s8", v8i8, v8i8,
              int_arm_neon_vpmaxs, 0>;
-def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, "vpmax.s16", v4i16, v4i16,
+def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax.s16", v4i16, v4i16,
              int_arm_neon_vpmaxs, 0>;
-def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, "vpmax.s32", v2i32, v2i32,
+def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax.s32", v2i32, v2i32,
              int_arm_neon_vpmaxs, 0>;
-def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, "vpmax.u8", v8i8, v8i8,
+def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax.u8", v8i8,
              v8i8, int_arm_neon_vpmaxu, 0>;
-def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16,
+def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax.u16", v4i16, v4i16,
              int_arm_neon_vpmaxu, 0>;
-def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
+def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax.u32", v2i32, v2i32,
              int_arm_neon_vpmaxu, 0>;
-def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
-             int_arm_neon_vpmaxf, 0>;
+def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax.f32", v2f32, v2f32,
+             int_arm_neon_vpmaxs, 0>;

// VPMIN : Vector Pairwise Minimum
-def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
+def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin.s8", v8i8, v8i8,
              int_arm_neon_vpmins, 0>;
-def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, "vpmin.s16", v4i16, v4i16,
+def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin.s16", v4i16, v4i16,
              int_arm_neon_vpmins, 0>;
-def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, "vpmin.s32", v2i32, v2i32,
+def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin.s32", v2i32, v2i32,
              int_arm_neon_vpmins, 0>;
-def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, "vpmin.u8", v8i8, v8i8,
+def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin.u8", v8i8, v8i8,
              int_arm_neon_vpminu, 0>;
-def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16,
+def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin.u16", v4i16, v4i16,
              int_arm_neon_vpminu, 0>;
-def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
+def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin.u32", v2i32, v2i32,
              int_arm_neon_vpminu, 0>;
-def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
-             int_arm_neon_vpminf, 0>;
+def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin.f32", v2f32, v2f32,
+             int_arm_neon_vpmins, 0>;

// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
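NEON has no full-precision vector divide; VRECPE gives a rough reciprocal estimate, and each VRECPS result, multiplied back in, performs one Newton-Raphson refinement step. A hedged sketch of the usual idiom for the instructions defined in this section, assuming <arm_neon.h>:

#include <arm_neon.h>

/* Approximate 1.0/d per lane. vrecps_f32(d, x) computes 2 - d*x, so each
   multiply-by-step roughly doubles the number of correct bits. */
float32x2_t reciprocal(float32x2_t d) {
    float32x2_t x = vrecpe_f32(d);       /* VRECPE.F32: initial estimate */
    x = vmul_f32(x, vrecps_f32(d, x));   /* VRECPS.F32 + VMUL: refine once */
    x = vmul_f32(x, vrecps_f32(d, x));   /* second refinement step */
    return x;
}

The VRSQRTE/VRSQRTS pair below follows the same estimate-and-step pattern for reciprocal square roots.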
// VRECPE : Vector Reciprocal Estimate
-def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+              IIC_VUNAD, "vrecpe.u32",
              v2i32, v2i32, int_arm_neon_vrecpe>;
-def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+              IIC_VUNAQ, "vrecpe.u32",
              v4i32, v4i32, int_arm_neon_vrecpe>;
-def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
-               v2f32, v2f32, int_arm_neon_vrecpef>;
-def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
-               v4f32, v4f32, int_arm_neon_vrecpef>;
+def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+               IIC_VUNAD, "vrecpe.f32",
+               v2f32, v2f32, int_arm_neon_vrecpe>;
+def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+               IIC_VUNAQ, "vrecpe.f32",
+               v4f32, v4f32, int_arm_neon_vrecpe>;
// VRECPS : Vector Reciprocal Step
-def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
+def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, IIC_VRECSD, "vrecps.f32", v2f32, v2f32,
               int_arm_neon_vrecps, 1>;
-def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v4f32, v4f32,
+def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, IIC_VRECSQ, "vrecps.f32", v4f32, v4f32,
               int_arm_neon_vrecps, 1>;
// VRSQRTE : Vector Reciprocal Square Root Estimate
-def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
-               v2i32, v2i32, int_arm_neon_vrsqrte>;
-def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
-               v4i32, v4i32, int_arm_neon_vrsqrte>;
-def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
-                v2f32, v2f32, int_arm_neon_vrsqrtef>;
-def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
-                v4f32, v4f32, int_arm_neon_vrsqrtef>;
+def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+               IIC_VUNAD, "vrsqrte.u32",
+               v2i32, v2i32, int_arm_neon_vrsqrte>;
+def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+               IIC_VUNAQ, "vrsqrte.u32",
+               v4i32, v4i32, int_arm_neon_vrsqrte>;
+def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                IIC_VUNAD, "vrsqrte.f32",
+                v2f32, v2f32, int_arm_neon_vrsqrte>;
+def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                IIC_VUNAQ, "vrsqrte.f32",
+                v4f32, v4f32, int_arm_neon_vrsqrte>;
// VRSQRTS : Vector Reciprocal Square Root Step
-def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, IIC_VRECSD, "vrsqrts.f32", v2f32, v2f32,
                int_arm_neon_vrsqrts, 1>;
-def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v4f32, v4f32,
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, IIC_VRECSQ, "vrsqrts.f32", v4f32, v4f32,
                int_arm_neon_vrsqrts, 1>;

// Vector Shifts.
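Among the shifts defined in this section, the saturating rounding narrowing forms are the workhorses of fixed-point code: one instruction shifts right by an immediate, rounds, and clamps into the narrower element type. A small C sketch, assuming <arm_neon.h>:

#include <arm_neon.h>

/* Q-format narrowing: round, shift each 16-bit lane right by 7, then
   saturate into int8. vqrshrn_n_s16 maps to the VQRSHRN.S16 form below. */
int8x8_t narrow_q7(int16x8_t acc) {
    return vqrshrn_n_s16(acc, 7);
}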
// VSHL : Vector Shift
-defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, "vshl.s", int_arm_neon_vshifts, 0>;
-defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, "vshl.u", int_arm_neon_vshiftu, 0>;
+defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+             IIC_VSHLiQ, "vshl.s", int_arm_neon_vshifts, 0>;
+defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+             IIC_VSHLiQ, "vshl.u", int_arm_neon_vshiftu, 0>;
// VSHL : Vector Shift Left (Immediate)
-defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, "vshl.i", NEONvshl>;
+defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLiD, "vshl.i", NEONvshl>;
// VSHR : Vector Shift Right (Immediate)
-defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, "vshr.s", NEONvshrs>;
-defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, "vshr.u", NEONvshru>;
+defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr.s", NEONvshrs>;
+defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr.u", NEONvshru>;

// VSHLL : Vector Shift Left Long
def VSHLLs8 : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
@@ -1134,86 +2066,90 @@ def VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
              v2i64, v2i32, NEONvshlli>;

// VSHRN : Vector Shift Right and Narrow
-def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, "vshrn.i16",
-              v8i8, v8i16, NEONvshrn>;
-def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1, "vshrn.i32",
-              v4i16, v4i32, NEONvshrn>;
-def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1, "vshrn.i64",
-              v2i32, v2i64, NEONvshrn>;
+def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1,
+              IIC_VSHLiD, "vshrn.i16", v8i8, v8i16, NEONvshrn>;
+def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1,
+              IIC_VSHLiD, "vshrn.i32", v4i16, v4i32, NEONvshrn>;
+def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1,
+              IIC_VSHLiD, "vshrn.i64", v2i32, v2i64, NEONvshrn>;
// VRSHL : Vector Rounding Shift
-defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, "vrshl.s", int_arm_neon_vrshifts, 0>;
-defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, "vrshl.u", int_arm_neon_vrshiftu, 0>;
+defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+              IIC_VSHLi4Q, "vrshl.s", int_arm_neon_vrshifts, 0>;
+defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+              IIC_VSHLi4Q, "vrshl.u", int_arm_neon_vrshiftu, 0>;
// VRSHR : Vector Rounding Shift Right
-defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, "vrshr.s", NEONvrshrs>;
-defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, "vrshr.u", NEONvrshru>;
+defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.s", NEONvrshrs>;
+defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.u", NEONvrshru>;
// VRSHRN : Vector Rounding Shift Right and Narrow
-def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1, "vrshrn.i16",
-               v8i8, v8i16, NEONvrshrn>;
-def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, "vrshrn.i32",
-               v4i16, v4i32, NEONvrshrn>;
-def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1, "vrshrn.i64",
-               v2i32, v2i64, NEONvrshrn>;
+def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1,
+               IIC_VSHLi4D, "vrshrn.i16", v8i8, v8i16, NEONvrshrn>;
+def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1,
+               IIC_VSHLi4D, "vrshrn.i32", v4i16, v4i32, NEONvrshrn>;
+def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1,
+               IIC_VSHLi4D, "vrshrn.i64", v2i32, v2i64, NEONvrshrn>;
// VQSHL : Vector Saturating Shift
-defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, "vqshl.s", int_arm_neon_vqshifts, 0>;
-defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, "vqshl.u", int_arm_neon_vqshiftu, 0>;
+defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+              IIC_VSHLi4Q, "vqshl.s", int_arm_neon_vqshifts, 0>;
+defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+              IIC_VSHLi4Q, "vqshl.u", int_arm_neon_vqshiftu, 0>;
// VQSHL : Vector Saturating Shift Left (Immediate)
-defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, "vqshl.s", NEONvqshls>;
-defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, "vqshl.u", NEONvqshlu>;
+defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.s", NEONvqshls>;
+defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.u", NEONvqshlu>;
// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
-defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, "vqshlu.s", NEONvqshlsu>;
+defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, IIC_VSHLi4D, "vqshlu.s", NEONvqshlsu>;
// VQSHRN : Vector Saturating Shift Right and Narrow
-def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.s16",
-                v8i8, v8i16, NEONvqshrns>;
-def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.s32",
-                v4i16, v4i32, NEONvqshrns>;
-def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.s64",
-                v2i32, v2i64, NEONvqshrns>;
-def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.u16",
-                v8i8, v8i16, NEONvqshrnu>;
-def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.u32",
-                v4i16, v4i32, NEONvqshrnu>;
-def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.u64",
-                v2i32, v2i64, NEONvqshrnu>;
+def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrn.s16", v8i8, v8i16, NEONvqshrns>;
+def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrn.s32", v4i16, v4i32, NEONvqshrns>;
+def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrn.s64", v2i32, v2i64, NEONvqshrns>;
+def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrn.u16", v8i8, v8i16, NEONvqshrnu>;
+def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrn.u32", v4i16, v4i32, NEONvqshrnu>;
+def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrn.u64", v2i32, v2i64, NEONvqshrnu>;
// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
-def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1, "vqshrun.s16",
-                v8i8, v8i16, NEONvqshrnsu>;
-def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1, "vqshrun.s32",
-                v4i16, v4i32, NEONvqshrnsu>;
-def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1, "vqshrun.s64",
-                v2i32, v2i64, NEONvqshrnsu>;
+def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrun.s16", v8i8, v8i16, NEONvqshrnsu>;
+def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrun.s32", v4i16, v4i32, NEONvqshrnsu>;
+def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1,
+                IIC_VSHLi4D, "vqshrun.s64", v2i32, v2i64, NEONvqshrnsu>;
// VQRSHL : Vector Saturating Rounding Shift
-defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, "vqrshl.s",
-               int_arm_neon_vqrshifts, 0>;
-defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, "vqrshl.u",
-               int_arm_neon_vqrshiftu, 0>;
+defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+               IIC_VSHLi4Q, "vqrshl.s", int_arm_neon_vqrshifts, 0>;
+defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+               IIC_VSHLi4Q, "vqrshl.u", int_arm_neon_vqrshiftu, 0>;
// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
-def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.s16",
-                v8i8, v8i16, NEONvqrshrns>;
-def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.s32",
-                v4i16, v4i32, NEONvqrshrns>;
-def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.s64",
-                v2i32, v2i64, NEONvqrshrns>;
-def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.u16",
-                v8i8, v8i16, NEONvqrshrnu>;
-def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.u32",
-                v4i16, v4i32, NEONvqrshrnu>;
-def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.u64",
-                v2i32, v2i64, NEONvqrshrnu>;
+def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrn.s16", v8i8, v8i16, NEONvqrshrns>;
+def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrn.s32", v4i16, v4i32, NEONvqrshrns>;
+def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrn.s64", v2i32, v2i64, NEONvqrshrns>;
+def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrn.u16", v8i8, v8i16, NEONvqrshrnu>;
+def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrn.u32", v4i16, v4i32, NEONvqrshrnu>;
+def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrn.u64", v2i32, v2i64, NEONvqrshrnu>;
// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
-def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1, "vqrshrun.s16",
-                v8i8, v8i16, NEONvqrshrnsu>;
-def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, "vqrshrun.s32",
-                v4i16, v4i32, NEONvqrshrnsu>;
-def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1, "vqrshrun.s64",
-                v2i32, v2i64, NEONvqrshrnsu>;
+def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrun.s16", v8i8, v8i16, NEONvqrshrnsu>;
+def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrun.s32", v4i16, v4i32, NEONvqrshrnsu>;
+def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1,
+                IIC_VSHLi4D, "vqrshrun.s64", v2i32, v2i64, NEONvqrshrnsu>;

// VSRA : Vector Shift Right and Accumulate
defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
@@ -1230,15 +2166,19 @@ defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>;

// Vector Absolute and Saturating Absolute.

// VABS : Vector Absolute Value
-defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
+defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+            IIC_VUNAiD, IIC_VUNAiQ, "vabs.s",
            int_arm_neon_vabs>;
-def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
-             v2f32, v2f32, int_arm_neon_vabsf>;
-def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
-             v4f32, v4f32, int_arm_neon_vabsf>;
+def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+             IIC_VUNAD, "vabs.f32",
+             v2f32, v2f32, int_arm_neon_vabs>;
+def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+             IIC_VUNAQ, "vabs.f32",
+             v4f32, v4f32, int_arm_neon_vabs>;
// VQABS : Vector Saturating Absolute Value
-defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
+defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+             IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs.s",
             int_arm_neon_vqabs>;

// Vector Negate.
@@ -1248,11 +2188,11 @@ def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>;

class VNEGD<bits<2> size, string OpcodeStr, ValueType Ty>
  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
-       !strconcat(OpcodeStr, "\t$dst, $src"), "",
+       IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
        [(set DPR:$dst, (Ty (vneg DPR:$src)))]>;
class VNEGQ<bits<2> size, string OpcodeStr, ValueType Ty>
  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
-       !strconcat(OpcodeStr, "\t$dst, $src"), "",
+       IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
        [(set QPR:$dst, (Ty (vneg QPR:$src)))]>;

// VNEG : Vector Negate
@@ -1265,10 +2205,12 @@ def VNEGs32q : VNEGQ<0b10, "vneg.s32", v4i32>;

// VNEG : Vector Negate (floating-point)
def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
-                  (outs DPR:$dst), (ins DPR:$src), "vneg.f32\t$dst, $src", "",
+                  (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD,
+                  "vneg.f32\t$dst, $src", "",
                   [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
-                  (outs QPR:$dst), (ins QPR:$src), "vneg.f32\t$dst, $src", "",
+                  (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ,
+                  "vneg.f32\t$dst, $src", "",
                   [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;

def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>;
@@ -1279,21 +2221,26 @@ def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>;
def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>;

// VQNEG : Vector Saturating Negate
-defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, "vqneg.s",
+defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+             IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg.s",
             int_arm_neon_vqneg>;

// Vector Bit Counting Operations.

// VCLS : Vector Count Leading Sign Bits
-defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, "vcls.s",
+defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+            IIC_VCNTiD, IIC_VCNTiQ, "vcls.s",
            int_arm_neon_vcls>;
// VCLZ : Vector Count Leading Zeros
-defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, "vclz.i",
+defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+            IIC_VCNTiD, IIC_VCNTiQ, "vclz.i",
            int_arm_neon_vclz>;
// VCNT : Vector Count One Bits
-def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+            IIC_VCNTiD, "vcnt.8",
            v8i8, v8i8, int_arm_neon_vcnt>;
-def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+            IIC_VCNTiQ, "vcnt.8",
            v16i8, v16i8, int_arm_neon_vcnt>;

// Vector Move Operations.
@@ -1301,9 +2248,9 @@ def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",

// VMOV : Vector Move (Register)
def VMOVD : N3V<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
-               "vmov\t$dst, $src", "", []>;
+               IIC_VMOVD, "vmov\t$dst, $src", "", []>;
def VMOVQ : N3V<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
-               "vmov\t$dst, $src", "", []>;
+               IIC_VMOVD, "vmov\t$dst, $src", "", []>;

// VMOV : Vector Move (Immediate)
@@ -1343,146 +2290,188 @@ def vmovImm64 : PatLeaf<(build_vector), [{
// be encoded based on the immed values.
def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
-                        (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+                        (ins i8imm:$SIMM), IIC_VMOVImm,
+                        "vmov.i8\t$dst, $SIMM", "",
                         [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
-                         (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+                         (ins i8imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i8\t$dst, $SIMM", "",
                          [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>;
def VMOVv4i16 : N1ModImm<1, 0b000, 0b1000, 0, 0, 0, 1, (outs DPR:$dst),
-                         (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+                         (ins i16imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i16\t$dst, $SIMM", "",
                          [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>;
def VMOVv8i16 : N1ModImm<1, 0b000, 0b1000, 0, 1, 0, 1, (outs QPR:$dst),
-                         (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+                         (ins i16imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i16\t$dst, $SIMM", "",
                          [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>;
def VMOVv2i32 : N1ModImm<1, 0b000, 0b0000, 0, 0, 0, 1, (outs DPR:$dst),
-                         (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+                         (ins i32imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i32\t$dst, $SIMM", "",
                          [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>;
def VMOVv4i32 : N1ModImm<1, 0b000, 0b0000, 0, 1, 0, 1, (outs QPR:$dst),
-                         (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+                         (ins i32imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i32\t$dst, $SIMM", "",
                          [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>;
def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
-                         (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+                         (ins i64imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i64\t$dst, $SIMM", "",
                          [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>;
def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
-                         (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+                         (ins i64imm:$SIMM), IIC_VMOVImm,
+                         "vmov.i64\t$dst, $SIMM", "",
                          [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>;

// VMOV : Vector Get Lane (move scalar to ARM core register)

def VGETLNs8 : NVGetLane<0b11100101, 0b1011, 0b00,
-                         (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
-                         "vmov", ".s8\t$dst, $src[$lane]",
+                         (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                         IIC_VMOVSI, "vmov", ".s8\t$dst, $src[$lane]",
                         [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src), imm:$lane))]>;
def VGETLNs16 : NVGetLane<0b11100001, 0b1011, 0b01,
-                          (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
-                          "vmov", ".s16\t$dst, $src[$lane]",
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", ".s16\t$dst, $src[$lane]",
                          [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src), imm:$lane))]>;
def VGETLNu8 : NVGetLane<0b11101101, 0b1011, 0b00,
-                         (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
-                         "vmov", ".u8\t$dst, $src[$lane]",
+                         (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                         IIC_VMOVSI, "vmov", ".u8\t$dst, $src[$lane]",
                         [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src), imm:$lane))]>;
def VGETLNu16 : NVGetLane<0b11101001, 0b1011, 0b01,
-                          (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
-                          "vmov", ".u16\t$dst, $src[$lane]",
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", ".u16\t$dst, $src[$lane]",
                          [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src), imm:$lane))]>;
def VGETLNi32 : NVGetLane<0b11100001, 0b1011, 0b00,
-                          (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
-                          "vmov", ".32\t$dst, $src[$lane]",
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", ".32\t$dst, $src[$lane]",
                          [(set GPR:$dst, (extractelt (v2i32 DPR:$src), imm:$lane))]>;
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
          (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
-                          (SubReg_i8_reg imm:$lane))),
+                          (DSubReg_i8_reg imm:$lane))),
                    (SubReg_i8_lane imm:$lane))>;
def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
          (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
-                            (SubReg_i16_reg imm:$lane))),
+                            (DSubReg_i16_reg imm:$lane))),
                     (SubReg_i16_lane imm:$lane))>;
def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
-                          (SubReg_i8_reg imm:$lane))),
+                          (DSubReg_i8_reg imm:$lane))),
                    (SubReg_i8_lane imm:$lane))>;
def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
          (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
-                            (SubReg_i16_reg imm:$lane))),
+                            (DSubReg_i16_reg imm:$lane))),
                     (SubReg_i16_lane imm:$lane))>;
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
          (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
-                            (SubReg_i32_reg imm:$lane))),
+                            (DSubReg_i32_reg imm:$lane))),
                     (SubReg_i32_lane imm:$lane))>;
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2),
+                          (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2),
+                          (SSubReg_f32_reg imm:$src2))>;
//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
-//          (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+//          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
-          (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;

// VMOV : Vector Set Lane (move ARM core register to scalar)

let Constraints = "$src1 = $dst" in {
def VSETLNi8 : NVSetLane<0b11100100, 0b1011, 0b00, (outs DPR:$dst),
-                         (ins DPR:$src1, GPR:$src2, i32imm:$lane),
-                         "vmov", ".8\t$dst[$lane], $src2",
+                         (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+                         IIC_VMOVISL, "vmov", ".8\t$dst[$lane], $src2",
                         [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1),
                                          GPR:$src2, imm:$lane))]>;
def VSETLNi16 : NVSetLane<0b11100000, 0b1011, 0b01, (outs DPR:$dst),
-                          (ins DPR:$src1, GPR:$src2, i32imm:$lane),
-                          "vmov", ".16\t$dst[$lane], $src2",
+                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", ".16\t$dst[$lane], $src2",
                          [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1),
                                           GPR:$src2, imm:$lane))]>;
def VSETLNi32 : NVSetLane<0b11100000, 0b1011, 0b00, (outs DPR:$dst),
-                          (ins DPR:$src1, GPR:$src2, i32imm:$lane),
-                          "vmov", ".32\t$dst[$lane], $src2",
+                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", ".32\t$dst[$lane], $src2",
                          [(set DPR:$dst, (insertelt (v2i32 DPR:$src1),
                                           GPR:$src2, imm:$lane))]>;
}
def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
          (v16i8 (INSERT_SUBREG QPR:$src1,
                  (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
-                                  (SubReg_i8_reg imm:$lane))),
+                                  (DSubReg_i8_reg imm:$lane))),
                            GPR:$src2, (SubReg_i8_lane imm:$lane)),
-                 (SubReg_i8_reg imm:$lane)))>;
+                 (DSubReg_i8_reg imm:$lane)))>;
def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
          (v8i16 (INSERT_SUBREG QPR:$src1,
                  (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
-                                    (SubReg_i16_reg imm:$lane))),
+                                    (DSubReg_i16_reg imm:$lane))),
                             GPR:$src2, (SubReg_i16_lane imm:$lane)),
-                 (SubReg_i16_reg imm:$lane)))>;
+                 (DSubReg_i16_reg imm:$lane)))>;
def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
          (v4i32 (INSERT_SUBREG QPR:$src1,
                  (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
-                                    (SubReg_i32_reg imm:$lane))),
+                                    (DSubReg_i32_reg imm:$lane))),
                             GPR:$src2, (SubReg_i32_lane imm:$lane)),
-                 (SubReg_i32_reg imm:$lane)))>;
+                 (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2),
+                         SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2),
+                         SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
-//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
-          (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+def : Pat<(v2f64 (scalar_to_vector DPR:$src)),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, arm_dsubreg_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+          (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+          (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+          (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                         (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         arm_dsubreg_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                         (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         arm_dsubreg_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                         (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         arm_dsubreg_0)>;

// VDUP : Vector Duplicate (from ARM core register to all elements)

-def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
-                       (vector_shuffle node:$lhs, node:$rhs), [{
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
-}]>;
-
class VDUPD<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src),
-          "vdup", !strconcat(asmSize, "\t$dst, $src"),
-          [(set DPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+          IIC_VMOVIS, "vdup", !strconcat(asmSize, "\t$dst, $src"),
+          [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
class VDUPQ<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src),
-          "vdup", !strconcat(asmSize, "\t$dst, $src"),
-          [(set QPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+          IIC_VMOVIS, "vdup", !strconcat(asmSize, "\t$dst, $src"),
+          [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;

def VDUP8d : VDUPD<0b11101100, 0b00, ".8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, ".16", v4i16>;
@@ -1492,45 +2481,28 @@ def VDUP16q : VDUPQ<0b11101010, 0b01, ".16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, ".32", v4i32>;

def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src),
-                   "vdup", ".32\t$dst, $src",
-                   [(set DPR:$dst, (v2f32 (splat_lo
-                                           (scalar_to_vector
-                                            (f32 (bitconvert GPR:$src))),
-                                           undef)))]>;
+                   IIC_VMOVIS, "vdup", ".32\t$dst, $src",
+                   [(set DPR:$dst, (v2f32 (NEONvdup
+                                           (f32 (bitconvert GPR:$src)))))]>;
def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
-                   "vdup", ".32\t$dst, $src",
-                   [(set QPR:$dst, (v4f32 (splat_lo
-                                           (scalar_to_vector
-                                            (f32 (bitconvert GPR:$src))),
-                                           undef)))]>;
+                   IIC_VMOVIS, "vdup", ".32\t$dst, $src",
+                   [(set QPR:$dst, (v4f32 (NEONvdup
+                                           (f32 (bitconvert GPR:$src)))))]>;

// VDUP : Vector Duplicate Lane (from scalar to all elements)

-def SHUFFLE_get_splat_lane : SDNodeXForm<vector_shuffle, [{
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  return CurDAG->getTargetConstant(SVOp->getSplatIndex(), MVT::i32);
-}]>;
-
-def splat_lane : PatFrag<(ops node:$lhs, node:$rhs),
-                         (vector_shuffle node:$lhs, node:$rhs), [{
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  return SVOp->isSplat();
-}], SHUFFLE_get_splat_lane>;
-
class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty>
  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
-        (outs DPR:$dst), (ins DPR:$src, i32imm:$lane),
+        (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
        !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
-        [(set DPR:$dst, (Ty (splat_lane:$lane DPR:$src, undef)))]>;
+        [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;

-// vector_shuffle requires that the source and destination types match, so
-// VDUP to a 128-bit result uses a target-specific VDUPLANEQ node.
class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr,
              ValueType ResTy, ValueType OpTy>
  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
-        (outs QPR:$dst), (ins DPR:$src, i32imm:$lane),
+        (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
        !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
-        [(set QPR:$dst, (ResTy (NEONvduplaneq (OpTy DPR:$src), imm:$lane)))]>;
+        [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>;

def VDUPLN8d : VDUPLND<0b00, 0b01, "vdup.8", v8i8>;
def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>;
@@ -1541,15 +2513,51 @@ def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>;
def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>;
def VDUPLNfq : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>;

+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+          (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+                                  (DSubReg_i8_reg imm:$lane))),
+                           (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+          (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i16_reg imm:$lane))),
+                            (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+          (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i32_reg imm:$lane))),
+                            (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+          (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+def VDUPfdf : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 0, 0,
+                  (outs DPR:$dst), (ins SPR:$src),
+                  IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+                  [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def VDUPfqf : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 1, 0,
+                  (outs QPR:$dst), (ins SPR:$src),
+                  IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+                  [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)),
+          (INSERT_SUBREG QPR:$src,
+                         (i64 (EXTRACT_SUBREG QPR:$src,
+                               (DSubReg_f64_reg imm:$lane))),
+                         (DSubReg_f64_other_reg imm:$lane))>;
+def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)),
+          (INSERT_SUBREG QPR:$src,
+                         (f64 (EXTRACT_SUBREG QPR:$src,
+                               (DSubReg_f64_reg imm:$lane))),
+                         (DSubReg_f64_other_reg imm:$lane))>;
+
// VMOVN : Vector Narrowing Move
-defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, "vmovn.i",
+defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, "vmovn.i",
             int_arm_neon_vmovn>;
// VQMOVN : Vector Saturating Narrowing Move
-defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, "vqmovn.s",
+defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, "vqmovn.s",
               int_arm_neon_vqmovns>;
-defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, "vqmovn.u",
+defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, "vqmovn.u",
               int_arm_neon_vqmovnu>;
-defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, "vqmovun.s",
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, "vqmovun.s",
                int_arm_neon_vqmovnsu>;
// VMOVL : Vector Lengthening Move
defm VMOVLs : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>;
@@ -1597,6 +2605,247 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
               v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;

+// Vector Reverse.
+
+// VREV64 : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VMOVD,
+        !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VMOVD,
+        !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>;
+
+def VREV64d8 : VREV64D<0b00, "vrev64.8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64.16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64.32", v2i32>;
+def VREV64df : VREV64D<0b10, "vrev64.32", v2f32>;
+
+def VREV64q8 : VREV64Q<0b00, "vrev64.8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64.16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64.32", v4i32>;
+def VREV64qf : VREV64Q<0b10, "vrev64.32", v4f32>;
+
+// VREV32 : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VMOVD,
+        !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VMOVD,
+        !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>;
+
+def VREV32d8 : VREV32D<0b00, "vrev32.8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32.16", v4i16>;
+
+def VREV32q8 : VREV32Q<0b00, "vrev32.8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32.16", v8i16>;
+
+// VREV16 : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VMOVD,
+        !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VMOVD,
+        !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>;
+
+def VREV16d8 : VREV16D<0b00, "vrev16.8", v8i8>;
+def VREV16q8 : VREV16Q<0b00, "vrev16.8", v16i8>;
+
+// Other Vector Shuffles.
+
+// VEXT : Vector Extract
+
+class VEXTd<string OpcodeStr, ValueType Ty>
+  : N3V<0,1,0b11,0b0000,0,0, (outs DPR:$dst),
+        (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
+        !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+        [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
+                                      (Ty DPR:$rhs), imm:$index)))]>;
+
+class VEXTq<string OpcodeStr, ValueType Ty>
+  : N3V<0,1,0b11,0b0000,1,0, (outs QPR:$dst),
+        (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
+        !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+        [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
+                                      (Ty QPR:$rhs), imm:$index)))]>;
+
+def VEXTd8 : VEXTd<"vext.8", v8i8>;
+def VEXTd16 : VEXTd<"vext.16", v4i16>;
+def VEXTd32 : VEXTd<"vext.32", v2i32>;
+def VEXTdf : VEXTd<"vext.32", v2f32>;
+
+def VEXTq8 : VEXTq<"vext.8", v16i8>;
+def VEXTq16 : VEXTq<"vext.16", v8i16>;
+def VEXTq32 : VEXTq<"vext.32", v4i32>;
+def VEXTqf : VEXTq<"vext.32", v4f32>;
+
+// VTRN : Vector Transpose
+
+def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn.8">;
+def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn.16">;
+def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn.32">;
+
+def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn.8">;
+def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn.16">;
+def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn.32">;
+
+// VUZP : Vector Unzip (Deinterleave)
+
+def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp.8">;
+def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp.16">;
+def VUZPd32 : N2VDShuffle<0b10, 0b00010, "vuzp.32">;
+
+def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp.8">;
+def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp.16">;
+def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp.32">;
+
+// VZIP : Vector Zip (Interleave)
+
+def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip.8">;
+def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip.16">;
+def VZIPd32 : N2VDShuffle<0b10, 0b00011, "vzip.32">;
+
+def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip.8">;
+def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip.16">;
+def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip.32">;
+
+// Vector Table Lookup and Table Extension.
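VTBL treats its table operands as an 8-byte-per-register lookup table and its final operand as byte indices; out-of-range indices produce zero, while the VTBX variants leave the corresponding destination byte unchanged. A hedged usage sketch for the instructions defined next, assuming <arm_neon.h>:

#include <arm_neon.h>

/* Arbitrary byte permutation of an 8-byte vector.
   vtbl1_u8 maps to the single-register VTBL1 form below. */
uint8x8_t permute8(uint8x8_t table, uint8x8_t idx) {
    return vtbl1_u8(table, idx);
}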
+
+// VTBL : Vector Table Lookup
+def VTBL1
+  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$src), IIC_VTB1,
+        "vtbl.8\t$dst, \\{$tbl1\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBL2
+  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2,
+        "vtbl.8\t$dst, \\{$tbl1,$tbl2\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2
+                               DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def VTBL3
+  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3,
+        "vtbl.8\t$dst, \\{$tbl1,$tbl2,$tbl3\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3
+                               DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def VTBL4
+  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4,
+        "vtbl.8\t$dst, \\{$tbl1,$tbl2,$tbl3,$tbl4\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2,
+                               DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
+
+// VTBX : Vector Table Extension
+def VTBX1
+  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst),
+        (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1,
+        "vtbx.8\t$dst, \\{$tbl1\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1
+                               DPR:$orig, DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBX2
+  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst),
+        (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2,
+        "vtbx.8\t$dst, \\{$tbl1,$tbl2\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2
+                               DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def VTBX3
+  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst),
+        (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3,
+        "vtbx.8\t$dst, \\{$tbl1,$tbl2,$tbl3\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1,
+                               DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def VTBX4
+  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1,
+        DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4,
+        "vtbx.8\t$dst, \\{$tbl1,$tbl2,$tbl3,$tbl4\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1,
+                               DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+// These need separate instructions because they must use DPR_VFP2 register
+// class which have SPR sub-registers.
+
+// Vector Add Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd,1>;
+def : N3VDsPat<fadd, VADDfd_sfp>;
+
+// Vector Sub Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub,0>;
+def : N3VDsPat<fsub, VSUBfd_sfp>;
+
+// Vector Multiply Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul,1>;
+def : N3VDsPat<fmul, VMULfd_sfp>;
+
+// Vector Multiply-Accumulate/Subtract used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32,fmul,fadd>;
+def : N3VDMulOpsPat<fmul, fadd, VMLAfd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32,fmul,fsub>;
+def : N3VDMulOpsPat<fmul, fsub, VMLSfd_sfp>;
+
+// Vector Absolute used for single-precision FP
+let neverHasSideEffects = 1 in
+def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                          IIC_VUNAD, "vabs.f32",
+                          v2f32, v2f32, int_arm_neon_vabs>;
+def : N2VDIntsPat<fabs, VABSfd_sfp>;
+
+// Vector Negate used for single-precision FP
+let neverHasSideEffects = 1 in
+def VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+                       (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD,
+                       "vneg.f32\t$dst, $src", "", []>;
+def : N2VDIntsPat<fneg, VNEGf32d_sfp>;
+
+// Vector Convert between single-precision FP and integer
+let neverHasSideEffects = 1 in
+def VCVTf2sd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32",
+                         v2i32, v2f32, fp_to_sint>;
+def : N2VDsPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTf2ud_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32",
+                         v2i32, v2f32, fp_to_uint>;
+def : N2VDsPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTs2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32",
+                         v2f32, v2i32, sint_to_fp>;
+def : N2VDsPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTu2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
+                         v2f32, v2i32, uint_to_fp>;
+def : N2VDsPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 904d9b1..9816add 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -117,86 +117,150 @@ def t_addrmode_sp : Operand<i32>,
let Defs = [SP], Uses = [SP] in {
def tADJCALLSTACKUP :
-PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
           "@ tADJCALLSTACKUP $amt1",
-           [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb]>;
+           [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>;

def tADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt),
+PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
           "@ tADJCALLSTACKDOWN $amt",
-           [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb]>;
+           [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>;
}

+// For both thumb1 and thumb2.
let isNotDuplicable = 1 in
-def tPICADD : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp),
-                   "$cp:\n\tadd $dst, pc",
-                   [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>;
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr,
+                  "\n$cp:\n\tadd $dst, pc",
+                  [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>;

// PC relative add.
-def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs),
+def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs), IIC_iALUi,
                   "add $dst, pc, $rhs * 4", []>;

// ADD rd, sp, #imm8
-// FIXME: hard code sp?
-def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs),
-                   "add $dst, $sp, $rhs * 4 @ addrspi", []>;
+def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs), IIC_iALUi,
+                   "add $dst, $sp, $rhs * 4", []>;

// ADD sp, sp, #imm7
-// FIXME: hard code sp?
-def tADDspi : T1It<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALUi,
                   "add $dst, $rhs * 4", []>;

-// FIXME: Make use of the following?
-// ADD rm, sp, rm
+// SUB sp, sp, #imm7
+def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALUi,
+                  "sub $dst, $rhs * 4", []>;
+
+// ADD rm, sp
+def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+              "add $dst, $rhs", []>;
+
// ADD sp, rm
+def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+                  "add $dst, $rhs", []>;
+
+// Pseudo instruction that will expand into a tSUBspi + a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+               NoItinerary, "@ sub $dst, $rhs * 4", []>;
+
+def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
+               NoItinerary, "@ add $dst, $rhs", []>;
+
+let Defs = [CPSR] in
+def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+             NoItinerary, "@ and $dst, $rhs", []>;
+} // usesCustomDAGSchedInserter

//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//

-let isReturn = 1, isTerminator = 1 in {
-  def tBX_RET : TI<(outs), (ins), "bx lr", [(ARMretflag)]>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def tBX_RET : TI<(outs), (ins), IIC_Br, "bx lr", [(ARMretflag)]>;
  // Alternative return instruction used by vararg functions.
-  def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), "bx $target", []>;
+  def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx $target", []>;
}

// FIXME: remove when we have a way to marking a MI with these properties.
-let isReturn = 1, isTerminator = 1 in -def tPOP_RET : TI<(outs reglist:$dst1, variable_ops), (ins), - "pop $dst1", []>; +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1 in +def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br, + "pop${p} $wb", []>; let isCall = 1, - Defs = [R0, R1, R2, R3, LR, - D0, D1, D2, D3, D4, D5, D6, D7] in { - def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops), + Defs = [R0, R1, R2, R3, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + // Also used for Thumb2 + def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br, "bl ${func:call}", - [(ARMtcall tglobaladdr:$func)]>; - // ARMv5T and above - def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops), + [(ARMtcall tglobaladdr:$func)]>, + Requires<[IsThumb, IsNotDarwin]>; + + // ARMv5T and above, also used for Thumb2 + def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br, "blx ${func:call}", - [(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>; - def tBLXr : TI<(outs), (ins tGPR:$func, variable_ops), + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; + + // Also used for Thumb2 + def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, + "blx $func", + [(ARMtcall GPR:$func)]>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; + + // ARMv4T + def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br, + "mov lr, pc\n\tbx $func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb1Only, IsNotDarwin]>; +} + +// On Darwin R9 is call-clobbered. +let isCall = 1, + Defs = [R0, R1, R2, R3, R9, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { + // Also used for Thumb2 + def tBLr9 : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br, + "bl ${func:call}", + [(ARMtcall tglobaladdr:$func)]>, + Requires<[IsThumb, IsDarwin]>; + + // ARMv5T and above, also used for Thumb2 + def tBLXi_r9 : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br, + "blx ${func:call}", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + + // Also used for Thumb2 + def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, "blx $func", - [(ARMtcall tGPR:$func)]>, Requires<[HasV5T]>; + [(ARMtcall GPR:$func)]>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + // ARMv4T - def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops), - "cpy lr, pc\n\tbx $func", - [(ARMcall_nolink tGPR:$func)]>; + def tBXr9 : TIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br, + "mov lr, pc\n\tbx $func", + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb1Only, IsDarwin]>; } let isBranch = 1, isTerminator = 1 in { let isBarrier = 1 in { let isPredicable = 1 in - def tB : T1I<(outs), (ins brtarget:$target), "b $target", - [(br bb:$target)]>; + def tB : T1I<(outs), (ins brtarget:$target), IIC_Br, + "b $target", [(br bb:$target)]>; // Far jump - def tBfar : T1Ix2<(outs), (ins brtarget:$target), + let Defs = [LR] in + def tBfar : TIx2<(outs), (ins brtarget:$target), IIC_Br, "bl $target\t@ far jump",[]>; def tBR_JTr : T1JTI<(outs), (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id), - "cpy pc, $target \n\t.align\t2\n$jt", + IIC_Br, "mov pc, $target\n\t.align\t2\n$jt", [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>; } } @@ -204,7 +268,8 @@ let isBranch = 1, isTerminator = 1 in { // FIXME: should be able to write a pattern for ARMBrcond, but 
can't use // a two-value operand where a dag node expects two operands. :( let isBranch = 1, isTerminator = 1 in - def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), "b$cc $target", + def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br, + "b$cc $target", [/*(ARMbrcond bb:$target, imm:$cc)*/]>; //===----------------------------------------------------------------------===// @@ -212,384 +277,363 @@ let isBranch = 1, isTerminator = 1 in // let canFoldAsLoad = 1 in -def tLDR : T1I4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), - "ldr $dst, $addr", +def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, + "ldr", " $dst, $addr", [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>; -def tLDRB : T1I1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), - "ldrb $dst, $addr", +def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr, + "ldrb", " $dst, $addr", [(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>; -def tLDRH : T1I2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), - "ldrh $dst, $addr", +def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr, + "ldrh", " $dst, $addr", [(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>; -def tLDRSB : T1I1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), - "ldrsb $dst, $addr", +let AddedComplexity = 10 in +def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr, + "ldrsb", " $dst, $addr", [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>; -def tLDRSH : T1I2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), - "ldrsh $dst, $addr", +let AddedComplexity = 10 in +def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr, + "ldrsh", " $dst, $addr", [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>; let canFoldAsLoad = 1 in -def tLDRspi : T1Is<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), - "ldr $dst, $addr", +def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, + "ldr", " $dst, $addr", [(set tGPR:$dst, (load t_addrmode_sp:$addr))]>; // Special instruction for restore. It cannot clobber condition register // when it's expanded by eliminateCallFramePseudoInstr(). let canFoldAsLoad = 1, mayLoad = 1 in -def tRestore : T1Is<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), - "ldr $dst, $addr", []>; +def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, + "ldr", " $dst, $addr", []>; // Load tconstpool let canFoldAsLoad = 1 in -def tLDRpci : T1Is<(outs tGPR:$dst), (ins i32imm:$addr), - "ldr $dst, $addr", +def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, + "ldr", " $dst, $addr", [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>; // Special LDR for loads from non-pc-relative constpools. 
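The immediate-offset loads above are tight on encoding space: t_addrmode_s4/s2/s1 carry a 5-bit immediate scaled by the access size, t_addrmode_sp an 8-bit immediate scaled by 4, and the signed loads (tLDRSB/tLDRSH) have no immediate form at all, only [reg, reg] -- which is why they get AddedComplexity and why sextload fallback patterns appear at the end of the file. A C++ sketch of the encodability checks (illustrative helpers, not LLVM's):

#include <cstdint>

// Thumb-1 immediate offset: a 5-bit field scaled by the access size,
// so 0-124 for words, 0-62 for halfwords, 0-31 for bytes.
bool fitsT1ImmOffset(uint32_t byteOffset, unsigned accessBytes) {
  return byteOffset % accessBytes == 0 && byteOffset / accessBytes < 32;
}

// sp-relative words (t_addrmode_sp): an 8-bit field scaled by 4.
bool fitsT1SPOffset(uint32_t byteOffset) {
  return byteOffset % 4 == 0 && byteOffset / 4 < 256;
}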
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in -def tLDRcp : T1Is<(outs tGPR:$dst), (ins i32imm:$addr), - "ldr $dst, $addr", []>; +def tLDRcp : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, + "ldr", " $dst, $addr", []>; -def tSTR : T1I4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), - "str $src, $addr", +def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer, + "str", " $src, $addr", [(store tGPR:$src, t_addrmode_s4:$addr)]>; -def tSTRB : T1I1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), - "strb $src, $addr", +def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer, + "strb", " $src, $addr", [(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>; -def tSTRH : T1I2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), - "strh $src, $addr", +def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer, + "strh", " $src, $addr", [(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>; -def tSTRspi : T1Is<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), - "str $src, $addr", +def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei, + "str", " $src, $addr", [(store tGPR:$src, t_addrmode_sp:$addr)]>; let mayStore = 1 in { // Special instruction for spill. It cannot clobber condition register // when it's expanded by eliminateCallFramePseudoInstr(). -def tSpill : T1Is<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), - "str $src, $addr", []>; +def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei, + "str", " $src, $addr", []>; } //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // -// TODO: A7-44: LDMIA - load multiple +// These requires base address to be written back or one of the loaded regs. +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +def tLDM : T1I<(outs), + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + IIC_iLoadm, + "ldm${addr:submode}${p} $addr, $wb", []>; -let mayLoad = 1 in -def tPOP : TI<(outs reglist:$dst1, variable_ops), (ins), - "pop $dst1", []>; +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +def tSTM : T1I<(outs), + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + IIC_iStorem, + "stm${addr:submode}${p} $addr, $wb", []>; -let mayStore = 1 in -def tPUSH : TI<(outs), (ins reglist:$src1, variable_ops), - "push $src1", []>; +let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in +def tPOP : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br, + "pop${p} $wb", []>; + +let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in +def tPUSH : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br, + "push${p} $wb", []>; //===----------------------------------------------------------------------===// // Arithmetic Instructions. 
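The load/store-multiple definitions above fix only syntax and register-allocation constraints; the transfer order is architectural. A behavioral C++ sketch of the increment-after submode with writeback, which is what tPOP/tPUSH perform over SP (illustrative, word-indexed memory):

#include <cstdint>
#include <vector>

// ldmia with writeback: registers fill from ascending word addresses
// starting at the base, and the written-back base ends up just past
// the last word transferred. stmia mirrors this for stores.
uint32_t ldmia_writeback(const std::vector<uint32_t> &mem, uint32_t base,
                         std::vector<uint32_t> &regs) {
  for (uint32_t &r : regs) {
    r = mem.at(base / 4);
    base += 4;
  }
  return base;   // new value of the base register
}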
// // Add with carry register -let isCommutable = 1, Defs = [CPSR], Uses = [CPSR] in -def tADCS : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "adc $dst, $rhs", - [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>; +let isCommutable = 1, Uses = [CPSR] in +def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "adc", " $dst, $rhs", + [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>; // Add immediate -let Defs = [CPSR] in { -def tADDi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "add $dst, $lhs, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>; -def tADDSi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "add $dst, $lhs, $rhs", - [(set tGPR:$dst, (addc tGPR:$lhs, imm0_7:$rhs))]>; -} +def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "add", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>; -let Defs = [CPSR] in { -def tADDi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "add $dst, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>; -def tADDSi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "add $dst, $rhs", - [(set tGPR:$dst, (addc tGPR:$lhs, imm8_255:$rhs))]>; -} +def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "add", " $dst, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>; // Add register -let isCommutable = 1, Defs = [CPSR] in { -def tADDrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "add $dst, $lhs, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>; -def tADDSrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "add $dst, $lhs, $rhs", - [(set tGPR:$dst, (addc tGPR:$lhs, tGPR:$rhs))]>; -} +let isCommutable = 1 in +def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "add", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>; let neverHasSideEffects = 1 in -def tADDhirr : T1It<(outs tGPR:$dst), (ins GPR:$lhs, GPR:$rhs), - "add $dst, $rhs @ addhirr", []>; +def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + "add", " $dst, $rhs", []>; // And register -let isCommutable = 1, Defs = [CPSR] in -def tAND : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "and $dst, $rhs", - [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>; +let isCommutable = 1 in +def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "and", " $dst, $rhs", + [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>; // ASR immediate -let Defs = [CPSR] in -def tASRri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "asr $dst, $lhs, $rhs", - [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>; +def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + "asr", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>; // ASR register -let Defs = [CPSR] in -def tASRrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "asr $dst, $rhs", - [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>; +def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "asr", " $dst, $rhs", + [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>; // BIC register -let Defs = [CPSR] in -def tBIC : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "bic $dst, $rhs", - [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>; +def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "bic", " $dst, $rhs", + [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>; // CMN register let Defs = [CPSR] in { -def tCMN : 
T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs), - "cmn $lhs, $rhs", - [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>; -def tCMNZ : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs), - "cmn $lhs, $rhs", - [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>; +def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmn", " $lhs, $rhs", + [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>; +def tCMNZ : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmn", " $lhs, $rhs", + [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>; } // CMP immediate let Defs = [CPSR] in { -def tCMPi8 : T1I<(outs), (ins tGPR:$lhs, i32imm:$rhs), - "cmp $lhs, $rhs", - [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>; -def tCMPZi8 : T1I<(outs), (ins tGPR:$lhs, i32imm:$rhs), - "cmp $lhs, $rhs", - [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>; +def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi, + "cmp", " $lhs, $rhs", + [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>; +def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi, + "cmp", " $lhs, $rhs", + [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>; } // CMP register let Defs = [CPSR] in { -def tCMPr : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs), - "cmp $lhs, $rhs", - [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>; -def tCMPZr : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs), - "cmp $lhs, $rhs", - [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>; +def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmp", " $lhs, $rhs", + [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>; +def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "cmp", " $lhs, $rhs", + [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>; + +def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, + "cmp", " $lhs, $rhs", []>; +def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, + "cmp", " $lhs, $rhs", []>; } -// TODO: A7-37: CMP(3) - cmp hi regs // XOR register -let isCommutable = 1, Defs = [CPSR] in -def tEOR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "eor $dst, $rhs", - [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>; +let isCommutable = 1 in +def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "eor", " $dst, $rhs", + [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>; // LSL immediate -let Defs = [CPSR] in -def tLSLri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "lsl $dst, $lhs, $rhs", - [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>; +def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + "lsl", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>; // LSL register -let Defs = [CPSR] in -def tLSLrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "lsl $dst, $rhs", - [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>; +def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "lsl", " $dst, $rhs", + [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>; // LSR immediate -let Defs = [CPSR] in -def tLSRri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "lsr $dst, $lhs, $rhs", - [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>; +def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + "lsr", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>; // LSR register -let Defs = [CPSR] in -def tLSRrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "lsr $dst, $rhs", - [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>; +def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "lsr", " $dst, $rhs", + [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>; // move register -let Defs = [CPSR] in -def tMOVi8 : T1I<(outs 
tGPR:$dst), (ins i32imm:$src), - "mov $dst, $src", - [(set tGPR:$dst, imm0_255:$src)]>; +def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi, + "mov", " $dst, $src", + [(set tGPR:$dst, imm0_255:$src)]>; // TODO: A7-73: MOV(2) - mov setting flag. -// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy', -// which is MOV(3). This also supports high registers. let neverHasSideEffects = 1 in { -def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "cpy $dst, $src", []>; -def tMOVhir2lor : T1I<(outs tGPR:$dst), (ins GPR:$src), - "cpy $dst, $src\t@ hir2lor", []>; -def tMOVlor2hir : T1I<(outs GPR:$dst), (ins tGPR:$src), - "cpy $dst, $src\t@ lor2hir", []>; -def tMOVhir2hir : T1I<(outs GPR:$dst), (ins GPR:$src), - "cpy $dst, $src\t@ hir2hir", []>; +// FIXME: Make this predicable. +def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "mov $dst, $src", []>; +let Defs = [CPSR] in +def tMOVSr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "movs $dst, $src", []>; + +// FIXME: Make these predicable. +def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr, + "mov $dst, $src", []>; +def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "mov $dst, $src", []>; +def tMOVgpr2gpr : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + "mov $dst, $src", []>; } // neverHasSideEffects // multiply register -let isCommutable = 1, Defs = [CPSR] in -def tMUL : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "mul $dst, $rhs", - [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>; +let isCommutable = 1 in +def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32, + "mul", " $dst, $rhs", + [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>; // move inverse register -let Defs = [CPSR] in -def tMVN : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "mvn $dst, $src", - [(set tGPR:$dst, (not tGPR:$src))]>; - -// negate register -let Defs = [CPSR] in -def tNEG : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "neg $dst, $src", - [(set tGPR:$dst, (ineg tGPR:$src))]>; +def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr, + "mvn", " $dst, $src", + [(set tGPR:$dst, (not tGPR:$src))]>; // bitwise or register -let isCommutable = 1, Defs = [CPSR] in -def tORR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "orr $dst, $rhs", - [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>; +let isCommutable = 1 in +def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "orr", " $dst, $rhs", + [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>; // swaps -def tREV : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "rev $dst, $src", - [(set tGPR:$dst, (bswap tGPR:$src))]>, - Requires<[IsThumb, HasV6]>; - -def tREV16 : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "rev16 $dst, $src", - [(set tGPR:$dst, - (or (and (srl tGPR:$src, (i32 8)), 0xFF), - (or (and (shl tGPR:$src, (i32 8)), 0xFF00), - (or (and (srl tGPR:$src, (i32 8)), 0xFF0000), - (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>, - Requires<[IsThumb, HasV6]>; - -def tREVSH : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "revsh $dst, $src", - [(set tGPR:$dst, - (sext_inreg - (or (srl (and tGPR:$src, 0xFFFF), (i32 8)), - (shl tGPR:$src, (i32 8))), i16))]>, - Requires<[IsThumb, HasV6]>; +def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "rev", " $dst, $src", + [(set tGPR:$dst, (bswap tGPR:$src))]>, + Requires<[IsThumb1Only, HasV6]>; + +def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "rev16", " $dst, $src", + [(set tGPR:$dst, + (or (and (srl tGPR:$src, (i32 8)), 0xFF), + (or (and 
(shl tGPR:$src, (i32 8)), 0xFF00), + (or (and (srl tGPR:$src, (i32 8)), 0xFF0000), + (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>, + Requires<[IsThumb1Only, HasV6]>; + +def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "revsh", " $dst, $src", + [(set tGPR:$dst, + (sext_inreg + (or (srl (and tGPR:$src, 0xFF00), (i32 8)), + (shl tGPR:$src, (i32 8))), i16))]>, + Requires<[IsThumb1Only, HasV6]>; // rotate right register -let Defs = [CPSR] in -def tROR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "ror $dst, $rhs", - [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>; +def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr, + "ror", " $dst, $rhs", + [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>; + +// negate register +def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi, + "rsb", " $dst, $src, #0", + [(set tGPR:$dst, (ineg tGPR:$src))]>; // Subtract with carry register -let Defs = [CPSR], Uses = [CPSR] in -def tSBCS : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "sbc $dst, $rhs", - [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>; +let Uses = [CPSR] in +def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "sbc", " $dst, $rhs", + [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>; // Subtract immediate -let Defs = [CPSR] in { -def tSUBi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "sub $dst, $lhs, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>; -def tSUBSi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "sub $dst, $lhs, $rhs", - [(set tGPR:$dst, (addc tGPR:$lhs, imm0_7_neg:$rhs))]>; -} +def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "sub", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>; -let Defs = [CPSR] in { -def tSUBi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "sub $dst, $rhs", - [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>; -def tSUBSi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "sub $dst, $rhs", - [(set tGPR:$dst, (addc tGPR:$lhs, imm8_255_neg:$rhs))]>; -} +def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi, + "sub", " $dst, $rhs", + [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>; // subtract register -let Defs = [CPSR] in { -def tSUBrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "sub $dst, $lhs, $rhs", - [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>; -def tSUBSrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - "sub $dst, $lhs, $rhs", - [(set tGPR:$dst, (subc tGPR:$lhs, tGPR:$rhs))]>; -} +def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr, + "sub", " $dst, $lhs, $rhs", + [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>; // TODO: A7-96: STMIA - store multiple. 
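The carry-flag discipline behind tADC/tSBC above: ARM's carry is an inverted borrow on subtraction, so sbc computes a - b - (1 - C), and adds/adc (or subs/sbc) chain 32-bit halves into 64-bit arithmetic. tRSB with #0 is how Thumb spells integer negation, hence its (ineg ...) pattern, and cmn earlier sets flags from lhs + rhs, i.e. compares lhs against -rhs. A C++ model of the chaining (illustrative):

#include <cstdint>
#include <cstdio>

// 64-bit add/sub from 32-bit halves, as adds/adc and subs/sbc pair up.
uint64_t add64(uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi) {
  uint32_t lo = alo + blo;            // adds: C = unsigned overflow
  uint32_t c  = lo < alo;
  uint32_t hi = ahi + bhi + c;        // adc: consumes C
  return ((uint64_t)hi << 32) | lo;
}

uint64_t sub64(uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi) {
  uint32_t lo = alo - blo;            // subs: C = !borrow
  uint32_t b  = alo < blo;
  uint32_t hi = ahi - bhi - b;        // sbc: a - b - (1 - C)
  return ((uint64_t)hi << 32) | lo;
}

int main() {
  printf("%llu\n", (unsigned long long)add64(0xffffffffu, 0, 1, 0)); // 4294967296
  return 0;
}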
-def tSUBspi : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), - "sub $dst, $rhs * 4", []>; - // sign-extend byte -def tSXTB : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "sxtb $dst, $src", - [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>, - Requires<[IsThumb, HasV6]>; +def tSXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "sxtb", " $dst, $src", + [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>, + Requires<[IsThumb1Only, HasV6]>; // sign-extend short -def tSXTH : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "sxth $dst, $src", - [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>, - Requires<[IsThumb, HasV6]>; +def tSXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "sxth", " $dst, $src", + [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>, + Requires<[IsThumb1Only, HasV6]>; // test let isCommutable = 1, Defs = [CPSR] in -def tTST : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs), - "tst $lhs, $rhs", - [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>; +def tTST : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr, + "tst", " $lhs, $rhs", + [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>; // zero-extend byte -def tUXTB : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "uxtb $dst, $src", - [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>, - Requires<[IsThumb, HasV6]>; +def tUXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "uxtb", " $dst, $src", + [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>, + Requires<[IsThumb1Only, HasV6]>; // zero-extend short -def tUXTH : T1I<(outs tGPR:$dst), (ins tGPR:$src), - "uxth $dst, $src", - [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>, - Requires<[IsThumb, HasV6]>; +def tUXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, + "uxth", " $dst, $src", + [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>, + Requires<[IsThumb1Only, HasV6]>; // Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC DAG operation. // Expanded by the scheduler into a branch sequence. let usesCustomDAGSchedInserter = 1 in // Expanded by the scheduler. - def tMOVCCr : + def tMOVCCr_pseudo : PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc), - "@ tMOVCCr $cc", - [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>; + NoItinerary, "@ tMOVCCr $cc", + [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>; + + +// 16-bit movcc in IT blocks for Thumb2. +def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr, + "mov", " $dst, $rhs", []>; + +def tMOVCCi : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iCMOVi, + "mov", " $dst, $rhs", []>; // tLEApcrel - Load a pc-relative address into a register without offending the // assembler. 
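The swap and extend instructions above (tREV/tREV16/tREVSH, tSXTB/tSXTH, tUXTB/tUXTH) encode their semantics as shift/and trees in the patterns; as a cross-check, the direct scalar forms in C++ (illustrative):

#include <cstdint>

uint32_t rev(uint32_t x) {                 // byte-reverse the word
  return (x >> 24) | ((x >> 8) & 0xFF00) |
         ((x << 8) & 0xFF0000) | (x << 24);
}
uint32_t rev16(uint32_t x) {               // byte-reverse each halfword
  return ((x >> 8) & 0x00FF00FFu) | ((x << 8) & 0xFF00FF00u);
}
int32_t revsh(uint32_t x) {                // swap low bytes, sign-extend
  return (int16_t)(((x >> 8) & 0xFF) | ((x & 0xFF) << 8));
}
int32_t  sxtb(uint32_t x) { return (int8_t)x;  }
int32_t  sxth(uint32_t x) { return (int16_t)x; }
uint32_t uxtb(uint32_t x) { return x & 0xFF;   }
uint32_t uxth(uint32_t x) { return x & 0xFFFF; }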
-def tLEApcrel : TIx2<(outs tGPR:$dst), (ins i32imm:$label), - !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(", - "${:private}PCRELL${:uid}+4))\n"), - !strconcat("\tmov $dst, #PCRELV${:uid}\n", - "${:private}PCRELL${:uid}:\n\tadd $dst, pc")), - []>; - -def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id), - !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(", - "${:private}PCRELL${:uid}+4))\n"), - !strconcat("\tmov $dst, #PCRELV${:uid}\n", - "${:private}PCRELL${:uid}:\n\tadd $dst, pc")), - []>; +def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, + "adr$p $dst, #$label", []>; + +def tLEApcrelJT : T1I<(outs tGPR:$dst), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + IIC_iALUi, "adr$p $dst, #${label}_${id}", []>; //===----------------------------------------------------------------------===// // TLS Instructions @@ -598,7 +642,7 @@ def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id), // __aeabi_read_tp preserves the registers r1-r3. let isCall = 1, Defs = [R0, LR] in { - def tTPsoft : TIx2<(outs), (ins), + def tTPsoft : TIx2<(outs), (ins), IIC_Br, "bl __aeabi_read_tp", [(set R0, ARMthread_pointer)]>; } @@ -607,20 +651,46 @@ let isCall = 1, // Non-Instruction Patterns // +// Add with carry +def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs), + (tADDi3 tGPR:$lhs, imm0_7:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs), + (tADDi8 tGPR:$lhs, imm8_255:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs), + (tADDrr tGPR:$lhs, tGPR:$rhs)>; + +// Subtract with carry +def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs), + (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs), + (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>; +def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), + (tSUBrr tGPR:$lhs, tGPR:$rhs)>; + // ConstantPool, GlobalAddress -def : TPat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>; -def : TPat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; +def : T1Pat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>; +def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; // JumpTable -def : TPat<(ARMWrapperJT tjumptable:$dst, imm:$id), - (tLEApcrelJT tjumptable:$dst, imm:$id)>; +def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (tLEApcrelJT tjumptable:$dst, imm:$id)>; // Direct calls -def : TPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>; -def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>; +def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>, + Requires<[IsThumb, IsNotDarwin]>; +def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>, + Requires<[IsThumb, IsDarwin]>; + +def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; +def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; // Indirect calls to ARM routines -def : Tv5Pat<(ARMcall tGPR:$dst), (tBLXr tGPR:$dst)>; +def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; +def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; // zextload i1 -> zextload i8 def : T1Pat<(zextloadi1 t_addrmode_s1:$addr), @@ -631,6 +701,20 @@ def : T1Pat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; def : T1Pat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; def : T1Pat<(extloadi16 t_addrmode_s2:$addr), (tLDRH 
t_addrmode_s2:$addr)>; +// If it's impossible to use [r,r] address mode for sextload, select to +// ldr{b|h} + sxt{b|h} instead. +def : T1Pat<(sextloadi8 t_addrmode_s1:$addr), + (tSXTB (tLDRB t_addrmode_s1:$addr))>, + Requires<[IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_s2:$addr), + (tSXTH (tLDRH t_addrmode_s2:$addr))>, + Requires<[IsThumb1Only, HasV6]>; + +def : T1Pat<(sextloadi8 t_addrmode_s1:$addr), + (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>; +def : T1Pat<(sextloadi16 t_addrmode_s1:$addr), + (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>; + // Large immediate handling. // Two piece imms. diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 50345a6..0750dcc 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -11,6 +11,21 @@ // //===----------------------------------------------------------------------===// +// IT block predicate field +def it_pred : Operand<i32> { + let PrintMethod = "printPredicateOperand"; +} + +// IT block condition mask +def it_mask : Operand<i32> { + let PrintMethod = "printThumbITMask"; +} + +// Table branch address +def tb_addrmode : Operand<i32> { + let PrintMethod = "printTBAddrMode"; +} + // Shifted operands. No register controlled shifts for Thumb2. // Note: We do not support rrx shifted operands yet. def t2_so_reg : Operand<i32>, // reg imm @@ -20,23 +35,14 @@ def t2_so_reg : Operand<i32>, // reg imm let MIOperandInfo = (ops GPR, i32imm); } -// t2_so_imm_XFORM - Return a t2_so_imm value packed into the format -// described for t2_so_imm def below. -def t2_so_imm_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant( - ARM_AM::getT2SOImmVal(N->getZExtValue()), MVT::i32); -}]>; - // t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant( - ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())), MVT::i32); + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); }]>; // t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant( - ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())), MVT::i32); + return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32); }]>; // t2_so_imm - Match a 32-bit immediate operand, which is an @@ -47,27 +53,21 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11]. def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; - }], t2_so_imm_XFORM> { - let PrintMethod = "printT2SOImmOperand"; -} + return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; +}]>; // t2_so_imm_not - Match an immediate that is a complement // of a t2_so_imm. def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1; - }], t2_so_imm_not_XFORM> { - let PrintMethod = "printT2SOImmOperand"; -} + return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1; +}], t2_so_imm_not_XFORM>; // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. 
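The fallback T1Pat forms just above synthesize a signed load when only the immediate-offset ldrb/ldrh fits: sxtb/sxth on V6, and a shift pair before V6. The shift trick in scalar C++ (illustrative; relies on arithmetic right shift for signed values, as on all mainstream compilers):

#include <cstdint>

// (tASRri (tLSLri x, 24), 24): shift the byte into the top bits, then
// arithmetic-shift it back down to sign-extend.
int32_t sext8_via_shifts(uint32_t loaded) {
  return (int32_t)(loaded << 24) >> 24;    // lsls #24 ; asrs #24
}
int32_t sext16_via_shifts(uint32_t loaded) {
  return (int32_t)(loaded << 16) >> 16;    // lsls #16 ; asrs #16
}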
def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1; - }], t2_so_imm_neg_XFORM> { - let PrintMethod = "printT2SOImmOperand"; -} + return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1; +}], t2_so_imm_neg_XFORM>; /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31]. def imm1_31 : PatLeaf<(i32 imm), [{ @@ -75,7 +75,8 @@ def imm1_31 : PatLeaf<(i32 imm), [{ }]>; /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095]. -def imm0_4095 : PatLeaf<(i32 imm), [{ +def imm0_4095 : Operand<i32>, + PatLeaf<(i32 imm), [{ return (uint32_t)N->getZExtValue() < 4096; }]>; @@ -83,48 +84,9 @@ def imm0_4095_neg : PatLeaf<(i32 imm), [{ return (uint32_t)(-N->getZExtValue()) < 4096; }], imm_neg_XFORM>; -/// imm0_65535 predicate - True if the 32-bit immediate is in the range -/// [0.65535]. -def imm0_65535 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 65536; -}]>; - -/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield -/// e.g., 0xf000ffff -def bf_inv_mask_imm : Operand<i32>, - PatLeaf<(imm), [{ - uint32_t v = (uint32_t)N->getZExtValue(); - if (v == 0xffffffff) - return 0; - // naive checker. should do better, but simple is best for now since it's - // more likely to be correct. - while (v & 1) v >>= 1; // shift off the leading 1's - if (v) - { - while (!(v & 1)) v >>=1; // shift off the mask - while (v & 1) v >>= 1; // shift off the trailing 1's - } - // if this is a mask for clearing a bitfield, what's left should be zero. - return (v == 0); -}] > { - let PrintMethod = "printBitfieldInvMaskImmOperand"; -} - -/// Split a 32-bit immediate into two 16 bit parts. -def t2_lo16 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff, - MVT::i32); -}]>; - -def t2_hi16 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32); -}]>; - -def t2_lo16AllZero : PatLeaf<(i32 imm), [{ - // Returns true if all low 16-bits are 0. - return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; - }], t2_hi16>; - +def imm0_255_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)(-N->getZExtValue()) < 255; +}], imm_neg_XFORM>; // Define Thumb2 specific addressing modes. @@ -147,14 +109,14 @@ def t2am_imm8_offset : Operand<i32>, let PrintMethod = "printT2AddrModeImm8OffsetOperand"; } -// t2addrmode_imm8s4 := reg + (imm8 << 2) +// t2addrmode_imm8s4 := reg +/- (imm8 << 2) def t2addrmode_imm8s4 : Operand<i32>, ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> { - let PrintMethod = "printT2AddrModeImm8Operand"; + let PrintMethod = "printT2AddrModeImm8s4Operand"; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); } -// t2addrmode_so_reg := reg + reg << imm2 +// t2addrmode_so_reg := reg + (reg << imm2) def t2addrmode_so_reg : Operand<i32>, ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { let PrintMethod = "printT2AddrModeSoRegOperand"; @@ -171,52 +133,58 @@ def t2addrmode_so_reg : Operand<i32>, /// changed to modify CPSR. 
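t2_so_imm above accepts exactly the Thumb-2 "modified immediate" values. Per the ARM ARM ThumbExpandImm rules these are a byte replicated in one of the splat patterns, or an 8-bit value with its top bit set rotated right by 8..31. A brute-force C++ sketch of the predicate that ARM_AM::getT2SOImmVal implements (illustrative, not the LLVM routine):

#include <cstdint>

static uint32_t ror32(uint32_t v, unsigned amt) {   // amt in 1..31 here
  return (v >> amt) | (v << (32 - amt));
}

bool isT2SOImm(uint32_t v) {
  uint32_t lo = v & 0xFF, hi = (v >> 8) & 0xFF;
  if (v == lo || v == lo * 0x00010001u || v == lo * 0x01010101u ||
      v == hi * 0x01000100u)
    return true;       // 0x000000XY, 0x00XY00XY, 0xXYXYXYXY, 0xXY00XY00
  for (unsigned rot = 8; rot < 32; ++rot)
    for (uint32_t byte = 0x80; byte <= 0xFF; ++byte)
      if (ror32(byte, rot) == v)
        return true;   // rotated 8-bit value with bit 7 set
  return false;
}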
multiclass T2I_un_irs<string opc, PatFrag opnode, bit Cheap = 0, bit ReMat = 0>{ // shifted imm - def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), + def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, opc, " $dst, $src", [(set GPR:$dst, (opnode t2_so_imm:$src))]> { let isAsCheapAsAMove = Cheap; let isReMaterializable = ReMat; } // register - def r : T2I<(outs GPR:$dst), (ins GPR:$src), - opc, " $dst, $src", + def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + opc, ".w $dst, $src", [(set GPR:$dst, (opnode GPR:$src))]>; // shifted register - def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src), - opc, " $dst, $src", + def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi, + opc, ".w $dst, $src", [(set GPR:$dst, (opnode t2_so_reg:$src))]>; } /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a // binary operation that produces a value. These are predicable and can be /// changed to modify CPSR. -multiclass T2I_bin_irs<string opc, PatFrag opnode, bit Commutable = 0> { +multiclass T2I_bin_irs<string opc, PatFrag opnode, + bit Commutable = 0, string wide =""> { // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, opc, " $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>; // register - def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - opc, " $dst, $lhs, $rhs", + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, !strconcat(wide, " $dst, $lhs, $rhs"), [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { let isCommutable = Commutable; } // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), - opc, " $dst, $lhs, $rhs", + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, !strconcat(wide, " $dst, $lhs, $rhs"), [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>; } +/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need +// the ".w" prefix to indicate that they are wide. +multiclass T2I_bin_w_irs<string opc, PatFrag opnode, bit Commutable = 0> : + T2I_bin_irs<opc, opnode, Commutable, ".w">; + /// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are /// reversed. It doesn't define the 'rr' form since it's handled by its /// T2I_bin_irs counterpart. 
multiclass T2I_rbin_is<string opc, PatFrag opnode> { // shifted imm - def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), - opc, " $dst, $rhs, $lhs", + def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi, + opc, ".w $dst, $rhs, $lhs", [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>; // shifted register - def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), + def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, opc, " $dst, $rhs, $lhs", [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>; } @@ -226,18 +194,18 @@ multiclass T2I_rbin_is<string opc, PatFrag opnode> { let Defs = [CPSR] in { multiclass T2I_bin_s_irs<string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), - !strconcat(opc, "s"), " $dst, $lhs, $rhs", + def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + !strconcat(opc, "s"), ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>; // register - def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - !strconcat(opc, "s"), " $dst, $lhs, $rhs", + def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + !strconcat(opc, "s"), ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { let isCommutable = Commutable; } // shifted register - def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), - !strconcat(opc, "s"), " $dst, $lhs, $rhs", + def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + !strconcat(opc, "s"), ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>; } } @@ -246,22 +214,22 @@ multiclass T2I_bin_s_irs<string opc, PatFrag opnode, bit Commutable = 0> { /// patterns for a binary operation that produces a value. 
multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), - opc, " $dst, $lhs, $rhs", + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>; // 12-bit imm - def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), + def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi, !strconcat(opc, "w"), " $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]>; // register - def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - opc, " $dst, $lhs, $rhs", + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { let isCommutable = Commutable; } // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), - opc, " $dst, $lhs, $rhs", + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>; } @@ -271,41 +239,41 @@ multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> { let Uses = [CPSR] in { multiclass T2I_adde_sube_irs<string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, opc, " $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, Requires<[IsThumb2, CarryDefIsUnused]>; // register - def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - opc, " $dst, $lhs, $rhs", + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, Requires<[IsThumb2, CarryDefIsUnused]> { let isCommutable = Commutable; } // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), - opc, " $dst, $lhs, $rhs", + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, Requires<[IsThumb2, CarryDefIsUnused]>; // Carry setting variants // shifted imm - def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), + def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, !strconcat(opc, "s $dst, $lhs, $rhs"), [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, Requires<[IsThumb2, CarryDefIsUsed]> { let Defs = [CPSR]; } // register - def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - !strconcat(opc, "s $dst, $lhs, $rhs"), + def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + !strconcat(opc, "s.w $dst, $lhs, $rhs"), [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, Requires<[IsThumb2, CarryDefIsUsed]> { let Defs = [CPSR]; let isCommutable = Commutable; } // shifted register - def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), - !strconcat(opc, "s $dst, $lhs, $rhs"), + def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + !strconcat(opc, "s.w $dst, $lhs, $rhs"), [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, Requires<[IsThumb2, CarryDefIsUsed]> { let Defs = [CPSR]; @@ -313,49 +281,17 @@ multiclass T2I_adde_sube_irs<string opc, PatFrag opnode, bit Commutable = 0> { } } -/// T2I_rsc_is - Same as T2I_adde_sube_irs except the order of operands are -/// reversed. 
It doesn't define the 'rr' form since it's handled by its -/// T2I_adde_sube_irs counterpart. -let Defs = [CPSR], Uses = [CPSR] in { -multiclass T2I_rsc_is<string opc, PatFrag opnode> { - // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), - opc, " $dst, $rhs, $lhs", - [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUnused]>; - // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), - opc, " $dst, $rhs, $lhs", - [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUnused]>; - // shifted imm - def Sri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), - !strconcat(opc, "s $dst, $rhs, $lhs"), - [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUsed]> { - let Defs = [CPSR]; - } - // shifted register - def Srs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), - !strconcat(opc, "s $dst, $rhs, $lhs"), - [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUsed]> { - let Defs = [CPSR]; - } -} -} - -/// T2I_rbin_s_is - Same as T2I_bin_s_irs except the order of operands are -/// reversed. It doesn't define the 'rr' form since it's handled by its -/// T2I_bin_s_irs counterpart. +/// T2I_rbin_s_is - Same as T2I_rbin_is except sets 's' bit. let Defs = [CPSR] in { multiclass T2I_rbin_s_is<string opc, PatFrag opnode> { // shifted imm def ri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs, cc_out:$s), - !strconcat(opc, "${s} $dst, $rhs, $lhs"), + IIC_iALUi, + !strconcat(opc, "${s}.w $dst, $rhs, $lhs"), [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>; // shifted register def rs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs, cc_out:$s), + IIC_iALUsi, !strconcat(opc, "${s} $dst, $rhs, $lhs"), [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>; } @@ -365,96 +301,96 @@ multiclass T2I_rbin_s_is<string opc, PatFrag opnode> { // rotate operation that produces a value. multiclass T2I_sh_ir<string opc, PatFrag opnode> { // 5-bit imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), - opc, " $dst, $lhs, $rhs", + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iMOVsi, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, imm1_31:$rhs))]>; // register - def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - opc, " $dst, $lhs, $rhs", + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iMOVsr, + opc, ".w $dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>; } -/// T21_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test +/// T2I_cmp_is - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test /// patterns. Similar to T2I_bin_irs except the instruction does not produce /// a explicit result, only implicitly set CPSR. 
-let Uses = [CPSR] in { +let Defs = [CPSR] in { multiclass T2I_cmp_is<string opc, PatFrag opnode> { // shifted imm - def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), - opc, " $lhs, $rhs", + def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi, + opc, ".w $lhs, $rhs", [(opnode GPR:$lhs, t2_so_imm:$rhs)]>; // register - def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), - opc, " $lhs, $rhs", + def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr, + opc, ".w $lhs, $rhs", [(opnode GPR:$lhs, GPR:$rhs)]>; // shifted register - def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), - opc, " $lhs, $rhs", + def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi, + opc, ".w $lhs, $rhs", [(opnode GPR:$lhs, t2_so_reg:$rhs)]>; } } /// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns. multiclass T2I_ld<string opc, PatFrag opnode> { - def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), - opc, " $dst, $addr", + def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi, + opc, ".w $dst, $addr", [(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]>; - def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), + def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi, opc, " $dst, $addr", [(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]>; - def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), - opc, " $dst, $addr", + def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr, + opc, ".w $dst, $addr", [(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]>; - def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), - opc, " $dst, $addr", + def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi, + opc, ".w $dst, $addr", [(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]>; } /// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns. multiclass T2I_st<string opc, PatFrag opnode> { - def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), - opc, " $src, $addr", + def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei, + opc, ".w $src, $addr", [(opnode GPR:$src, t2addrmode_imm12:$addr)]>; - def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), + def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei, opc, " $src, $addr", [(opnode GPR:$src, t2addrmode_imm8:$addr)]>; - def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), - opc, " $src, $addr", + def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer, + opc, ".w $src, $addr", [(opnode GPR:$src, t2addrmode_so_reg:$addr)]>; } /// T2I_picld - Defines the PIC load pattern. class T2I_picld<string opc, PatFrag opnode> : - T2I<(outs GPR:$dst), (ins addrmodepc:$addr), - !strconcat("${addr:label}:\n\t", opc), " $dst, $addr", + T2I<(outs GPR:$dst), (ins addrmodepc:$addr), IIC_iLoadi, + !strconcat("\n${addr:label}:\n\t", opc), " $dst, $addr", [(set GPR:$dst, (opnode addrmodepc:$addr))]>; /// T2I_picst - Defines the PIC store pattern. class T2I_picst<string opc, PatFrag opnode> : - T2I<(outs), (ins GPR:$src, addrmodepc:$addr), - !strconcat("${addr:label}:\n\t", opc), " $src, $addr", + T2I<(outs), (ins GPR:$src, addrmodepc:$addr), IIC_iStorer, + !strconcat("\n${addr:label}:\n\t", opc), " $src, $addr", [(opnode GPR:$src, addrmodepc:$addr)]>; /// T2I_unary_rrot - A unary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. 
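Before the multiclass itself, a scalar C++ model of that rotated form (illustrative): "uxtb dst, src, ror #8" extracts and zero-extends byte 1 of the source, and likewise for the other rotations in {0, 8, 16, 24}.

#include <cstdint>

static uint32_t ror32(uint32_t v, unsigned amt) {
  return amt ? (v >> amt) | (v << (32 - amt)) : v;
}

uint32_t uxtb_ror(uint32_t src, unsigned rot) {  // rot in {0,8,16,24}
  return ror32(src, rot) & 0xFF;
}
int32_t sxtb_ror(uint32_t src, unsigned rot) {
  return (int8_t)ror32(src, rot);
}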
multiclass T2I_unary_rrot<string opc, PatFrag opnode> { - def r : T2I<(outs GPR:$dst), (ins GPR:$Src), - opc, " $dst, $Src", - [(set GPR:$dst, (opnode GPR:$Src))]>; - def r_rot : T2I<(outs GPR:$dst), (ins GPR:$Src, i32imm:$rot), - opc, " $dst, $Src, ror $rot", - [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>; + def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + opc, ".w $dst, $src", + [(set GPR:$dst, (opnode GPR:$src))]>; + def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, + opc, ".w $dst, $src, ror $rot", + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>; } /// T2I_bin_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. multiclass T2I_bin_rrot<string opc, PatFrag opnode> { - def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), + def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr, opc, " $dst, $LHS, $RHS", [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>; def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), - opc, " $dst, $LHS, $RHS, ror $rot", + IIC_iALUsr, opc, " $dst, $LHS, $RHS, ror $rot", [(set GPR:$dst, (opnode GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)))]>; } @@ -467,42 +403,46 @@ multiclass T2I_bin_rrot<string opc, PatFrag opnode> { // Miscellaneous Instructions. // -let isNotDuplicable = 1 in -def t2PICADD : T2XI<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp), - "$cp:\n\tadd $dst, pc", - [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>; - - // LEApcrel - Load a pc-relative address into a register without offending the // assembler. -def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), - !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(", - "${:private}PCRELL${:uid}+8))\n"), - !strconcat("${:private}PCRELL${:uid}:\n\t", - "add$p $dst, pc, #PCRELV${:uid}")), - []>; +def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, + "adr$p.w $dst, #$label", []>; def t2LEApcrelJT : T2XI<(outs GPR:$dst), - (ins i32imm:$label, i32imm:$id, pred:$p), - !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(", - "${:private}PCRELL${:uid}+8))\n"), - !strconcat("${:private}PCRELL${:uid}:\n\t", - "add$p $dst, pc, #PCRELV${:uid}")), - []>; - -// ADD rd, sp, #so_imm -def t2ADDrSPi : T2XI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), - "add $dst, $sp, $imm", - []>; - -// ADD rd, sp, #imm12 -def t2ADDrSPi12 : T2XI<(outs GPR:$dst), (ins GPR:$sp, i32imm:$imm), - "addw $dst, $sp, $imm", - []>; - -def t2ADDrSPs : T2XI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), - "addw $dst, $sp, $rhs", - []>; + (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, + "adr$p.w $dst, #${label}_${id}", []>; + +// ADD r, sp, {so_imm|i12} +def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), + IIC_iALUi, "add", ".w $dst, $sp, $imm", []>; +def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), + IIC_iALUi, "addw", " $dst, $sp, $imm", []>; + +// ADD r, sp, so_reg +def t2ADDrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), + IIC_iALUsi, "add", ".w $dst, $sp, $rhs", []>; + +// SUB r, sp, {so_imm|i12} +def t2SUBrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), + IIC_iALUi, "sub", ".w $dst, $sp, $imm", []>; +def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), + IIC_iALUi, "subw", " $dst, $sp, $imm", []>; + +// SUB r, sp, so_reg +def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), + IIC_iALUsi, + "sub", " $dst, $sp, $rhs", 
[]>; + + +// Pseudo instruction that will expand into a t2SUBrSPi + a copy. +let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler. +def t2SUBrSPi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), + NoItinerary, "@ sub.w $dst, $sp, $imm", []>; +def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), + NoItinerary, "@ subw $dst, $sp, $imm", []>; +def t2SUBrSPs_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), + NoItinerary, "@ sub $dst, $sp, $rhs", []>; +} // usesCustomDAGSchedInserter //===----------------------------------------------------------------------===// @@ -521,12 +461,14 @@ defm t2LDRB : T2I_ld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; defm t2LDRSH : T2I_ld<"ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>; defm t2LDRSB : T2I_ld<"ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>; -let mayLoad = 1 in { +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword -def t2LDRDi8 : T2Ii8s4<(outs GPR:$dst), (ins t2addrmode_imm8s4:$addr), - "ldrd", " $dst, $addr", []>; -def t2LDRDpci : T2Ii8s4<(outs GPR:$dst), (ins i32imm:$addr), - "ldrd", " $dst, $addr", []>; +def t2LDRDi8 : T2Ii8s4<(outs GPR:$dst1, GPR:$dst2), + (ins t2addrmode_imm8s4:$addr), + IIC_iLoadi, "ldrd", " $dst1, $addr", []>; +def t2LDRDpci : T2Ii8s4<(outs GPR:$dst1, GPR:$dst2), + (ins i32imm:$addr), IIC_iLoadi, + "ldrd", " $dst1, $addr", []>; } // zextload i1 -> zextload i8 @@ -573,57 +515,57 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)), let mayLoad = 1 in { def t2LDR_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, "ldr", " $dst, $addr!", "$addr.base = $base_wb", []>; def t2LDR_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, "ldr", " $dst, [$base], $offset", "$base = $base_wb", []>; def t2LDRB_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, "ldrb", " $dst, $addr!", "$addr.base = $base_wb", []>; def t2LDRB_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, "ldrb", " $dst, [$base], $offset", "$base = $base_wb", []>; def t2LDRH_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, "ldrh", " $dst, $addr!", "$addr.base = $base_wb", []>; def t2LDRH_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, "ldrh", " $dst, [$base], $offset", "$base = $base_wb", []>; def t2LDRSB_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, "ldrsb", " $dst, $addr!", "$addr.base = $base_wb", []>; def t2LDRSB_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, "ldrsb", " $dst, [$base], $offset", "$base = $base_wb", []>; def t2LDRSH_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins t2addrmode_imm8:$addr), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iLoadiu, "ldrsh", " $dst, $addr!", "$addr.base = 
$base_wb", []>; def t2LDRSH_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iLoadiu, "ldrsh", " $dst, [$base], $offset", "$base = $base_wb", []>; } @@ -634,108 +576,95 @@ defm t2STRB : T2I_st<"strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; defm t2STRH : T2I_st<"strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; // Store doubleword -let mayLoad = 1 in -def t2STRDi8 : T2Ii8s4<(outs), (ins GPR:$src, t2addrmode_imm8s4:$addr), - "strd", " $src, $addr", []>; +let mayLoad = 1, hasExtraSrcRegAllocReq = 1 in +def t2STRDi8 : T2Ii8s4<(outs), + (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr), + IIC_iStorer, "strd", " $src1, $addr", []>; // Indexed stores def t2STR_PRE : T2Iidxldst<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iStoreiu, "str", " $src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; def t2STR_POST : T2Iidxldst<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iStoreiu, "str", " $src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; def t2STRH_PRE : T2Iidxldst<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iStoreiu, "strh", " $src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; def t2STRH_POST : T2Iidxldst<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iStoreiu, "strh", " $src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; def t2STRB_PRE : T2Iidxldst<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePre, + AddrModeT2_i8, IndexModePre, IIC_iStoreiu, "strb", " $src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; def t2STRB_POST : T2Iidxldst<(outs GPR:$base_wb), (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset), - AddrModeT2_i8, IndexModePost, + AddrModeT2_i8, IndexModePost, IIC_iStoreiu, "strb", " $src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>; -// Address computation and loads and stores in PIC mode. 
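The _PRE/_POST definitions above are the two writeback modes; t2am_imm8_offset is an 8-bit magnitude with a sign, so the offset spans -255..+255. Their semantics in C++ (illustrative):

#include <cstdint>
#include <cstring>

struct LoadResult { uint32_t value; uint32_t new_base; };

// Pre-indexed, "ldr rd, [rn, #off]!": the updated address is used for
// the access itself, then written back to the base register.
LoadResult ldr_pre(const uint8_t *mem, uint32_t base, int32_t off) {
  uint32_t addr = base + (uint32_t)off, v;
  std::memcpy(&v, mem + addr, 4);
  return { v, addr };
}
// Post-indexed, "ldr rd, [rn], #off": the access uses the old base,
// and the base advances afterwards.
LoadResult ldr_post(const uint8_t *mem, uint32_t base, int32_t off) {
  uint32_t v;
  std::memcpy(&v, mem + base, 4);
  return { v, base + (uint32_t)off };
}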
-let isNotDuplicable = 1, AddedComplexity = 10 in { -let canFoldAsLoad = 1 in -def t2PICLDR : T2I_picld<"ldr", UnOpFrag<(load node:$Src)>>; - -def t2PICLDRH : T2I_picld<"ldrh", UnOpFrag<(zextloadi16 node:$Src)>>; -def t2PICLDRB : T2I_picld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; -def t2PICLDRSH : T2I_picld<"ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>; -def t2PICLDRSB : T2I_picld<"ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>; - -def t2PICSTR : T2I_picst<"str", BinOpFrag<(store node:$LHS, node:$RHS)>>; -def t2PICSTRH : T2I_picst<"strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; -def t2PICSTRB : T2I_picst<"strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; -} // isNotDuplicable = 1, AddedComplexity = 10 - +// FIXME: ldrd / strd pre / post variants //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // -let mayLoad = 1 in +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in def t2LDM : T2XI<(outs), - (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops), - "ldm${p}${addr:submode} $addr, $dst1", []>; + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + IIC_iLoadm, "ldm${addr:submode}${p}${addr:wide} $addr, $wb", []>; -let mayStore = 1 in +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in def t2STM : T2XI<(outs), - (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops), - "stm${p}${addr:submode} $addr, $src1", []>; + (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), + IIC_iStorem, "stm${addr:submode}${p}${addr:wide} $addr, $wb", []>; //===----------------------------------------------------------------------===// // Move Instructions. // let neverHasSideEffects = 1 in -def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), - "mov", " $dst, $src", []>; +def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + "mov", ".w $dst, $src", []>; -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), - "mov", " $dst, $src", +// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. +let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in +def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, + "mov", ".w $dst, $src", [(set GPR:$dst, t2_so_imm:$src)]>; let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), +def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, "movw", " $dst, $src", [(set GPR:$dst, imm0_65535:$src)]>; -// FIXME: Also available in ARM mode. let Constraints = "$src = $dst" in -def t2MOVTi16 : T2sI<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), - "movt", " $dst, $imm", - [(set GPR:$dst, - (or (and GPR:$src, 0xffff), t2_lo16AllZero:$imm))]>; +def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi, + "movt", " $dst, $imm", + [(set GPR:$dst, + (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>; //===----------------------------------------------------------------------===// // Extend Instructions. 
@@ -785,12 +714,14 @@ defm t2SUBS : T2I_bin_s_irs <"sub", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
defm t2ADC  : T2I_adde_sube_irs<"adc",BinOpFrag<(adde node:$LHS, node:$RHS)>,1>;
defm t2SBC  : T2I_adde_sube_irs<"sbc",BinOpFrag<(sube node:$LHS, node:$RHS)>>;

-// RSB, RSC
+// RSB
defm t2RSB  : T2I_rbin_is   <"rsb", BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
defm t2RSBS : T2I_rbin_s_is <"rsb", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
-defm t2RSC  : T2I_rsc_is    <"rsc", BinOpFrag<(sube node:$LHS, node:$RHS)>>;

// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+let AddedComplexity = 1 in
+def : T2Pat<(add GPR:$src, imm0_255_neg:$imm),
+            (t2SUBri GPR:$src, imm0_255_neg:$imm)>;
def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
            (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>;
def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
@@ -806,105 +737,250 @@ defm t2LSR : T2I_sh_ir<"lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>;
defm t2ASR  : T2I_sh_ir<"asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>;
defm t2ROR  : T2I_sh_ir<"ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;

-def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src),
-                   "mov", " $dst, $src, rrx",
+let Uses = [CPSR] in {
+def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                   "rrx", " $dst, $src",
                   [(set GPR:$dst, (ARMrrx GPR:$src))]>;
+}
+
+let Defs = [CPSR] in {
+def t2MOVsrl_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                         "lsrs.w $dst, $src, #1",
+                         [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>;
+def t2MOVsra_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                         "asrs.w $dst, $src, #1",
+                         [(set GPR:$dst, (ARMsra_flag GPR:$src))]>;
+}

//===----------------------------------------------------------------------===//
//  Bitwise Instructions.
//

-defm t2AND  : T2I_bin_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
-defm t2ORR  : T2I_bin_irs<"orr", BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
-defm t2EOR  : T2I_bin_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+defm t2AND  : T2I_bin_w_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR  : T2I_bin_w_irs<"orr", BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm t2EOR  : T2I_bin_w_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;

-defm t2BIC  : T2I_bin_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm t2BIC  : T2I_bin_w_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;

-def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
-            (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
+let Constraints = "$src = $dst" in
+def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iALUi, "bfc", " $dst, $imm",
+                [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>;

-defm t2ORN  : T2I_bin_irs<"orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                 IIC_iALUi, "sbfx", " $dst, $src, $lsb, $width", []>;

-def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
-            (t2ORNri GPR:$src, t2_so_imm_not:$imm)>;
+def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                 IIC_iALUi, "ubfx", " $dst, $src, $lsb, $width", []>;
+
+// FIXME: A8.6.18  BFI - Bitfield insert (Encoding T1)
+
+defm t2ORN  : T2I_bin_irs<"orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>;

// Preferred over t2EORri ra, rb, -1 because mvn has a 16-bit version
let AddedComplexity = 1 in
defm t2MVN  : T2I_un_irs <"mvn", UnOpFrag<(not node:$Src)>, 1, 1>;

-def : T2Pat<(t2_so_imm_not:$src),
-            (t2MVNi t2_so_imm_not:$src)>;

-// A8.6.17  BFC - Bitfield clear
-// FIXME: Also available in ARM mode.
-let Constraints = "$src = $dst" in -def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), - "bfc", " $dst, $imm", - [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>; +def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm), + (t2BICri GPR:$src, t2_so_imm_not:$imm)>; -// FIXME: A8.6.18 BFI - Bitfield insert (Encoding T1) +// FIXME: Disable this pattern on Darwin to workaround an assembler bug. +def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm), + (t2ORNri GPR:$src, t2_so_imm_not:$imm)>, + Requires<[IsThumb2]>; + +def : T2Pat<(t2_so_imm_not:$src), + (t2MVNi t2_so_imm_not:$src)>; //===----------------------------------------------------------------------===// // Multiply Instructions. // let isCommutable = 1 in -def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), +def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, "mul", " $dst, $a, $b", [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>; -def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), +def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, "mla", " $dst, $a, $b, $c", [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>; -def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), +def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, "mls", " $dst, $a, $b, $c", [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>; -// FIXME: SMULL, etc. +// Extra precision multiplies with low / high results +let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64, + "smull", " $ldst, $hdst, $a, $b", []>; + +def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64, + "umull", " $ldst, $hdst, $a, $b", []>; +} + +// Multiply + accumulate +def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, + "smlal", " $ldst, $hdst, $a, $b", []>; + +def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, + "umlal", " $ldst, $hdst, $a, $b", []>; + +def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, + "umaal", " $ldst, $hdst, $a, $b", []>; +} // neverHasSideEffects + +// Most significant word multiply +def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + "smmul", " $dst, $a, $b", + [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>; + +def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmla", " $dst, $a, $b, $c", + [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>; + + +def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmls", " $dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>; + +multiclass T2I_smul<string opc, PatFrag opnode> { + def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "bb"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sext_inreg GPR:$b, i16)))]>; + + def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "bt"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, (i32 16))))]>; + + def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "tb"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), + (sext_inreg GPR:$b, i16)))]>; + + def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + !strconcat(opc, "tt"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), + (sra GPR:$b, (i32 16))))]>; + + def WB : T2I<(outs GPR:$dst), 
(ins GPR:$a, GPR:$b), IIC_iMUL16,
+             !strconcat(opc, "wb"), " $dst, $a, $b",
+             [(set GPR:$dst, (sra (opnode GPR:$a,
+                                   (sext_inreg GPR:$b, i16)), (i32 16)))]>;
+
+  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16,
+             !strconcat(opc, "wt"), " $dst, $a, $b",
+             [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sra GPR:$b, (i32 16))), (i32 16)))]>;
+}
+
+
+multiclass T2I_smla<string opc, PatFrag opnode> {
+  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+             !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc,
+                              (opnode (sext_inreg GPR:$a, i16),
+                                      (sext_inreg GPR:$b, i16))))]>;
+
+  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+            !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+            [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+                                                   (sra GPR:$b, (i32 16)))))]>;
+
+  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+             !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+                                                 (sext_inreg GPR:$b, i16))))]>;
+
+  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+                                                    (sra GPR:$b, (i32 16)))))]>;
+
+  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                   (sext_inreg GPR:$b, i16)), (i32 16))))]>;
+
+  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                    (sra GPR:$b, (i32 16))), (i32 16))))]>;
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// TODO: Halfword multiply accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
//===----------------------------------------------------------------------===//
//  Misc. Arithmetic Instructions.
//

-def t2CLZ : T2I<(outs GPR:$dst), (ins GPR:$src),
+def t2CLZ : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
                "clz", " $dst, $src",
                [(set GPR:$dst, (ctlz GPR:$src))]>;

-def t2REV : T2I<(outs GPR:$dst), (ins GPR:$src),
-                "rev", " $dst, $src",
+def t2REV : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+                "rev", ".w $dst, $src",
                [(set GPR:$dst, (bswap GPR:$src))]>;

-def t2REV16 : T2I<(outs GPR:$dst), (ins GPR:$src),
-                  "rev16", " $dst, $src",
+def t2REV16 : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+                  "rev16", ".w $dst, $src",
              [(set GPR:$dst,
                  (or (and (srl GPR:$src, (i32 8)), 0xFF),
                      (or (and (shl GPR:$src, (i32 8)), 0xFF00),
                          (or (and (srl GPR:$src, (i32 8)), 0xFF0000),
                              (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>;

-/////
-/// A8.6.137  REVSH
-/////
-def t2REVSH : T2I<(outs GPR:$dst), (ins GPR:$src),
-                  "revsh", " $dst, $src",
+def t2REVSH : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+                  "revsh", ".w $dst, $src",
                   [(set GPR:$dst,
                     (sext_inreg
-                      (or (srl (and GPR:$src, 0xFFFF), (i32 8)),
+                      (or (srl (and GPR:$src, 0xFF00), (i32 8)),
                           (shl GPR:$src, (i32 8))), i16))]>;

-// FIXME: PKHxx etc.
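// The pack-halfword patterns below combine the bottom half of one register
// with the (shifted) top half of another. As a rough sketch of the semantics
// being matched, with illustrative register names:
//   pkhbt rd, rn, rm, LSL #16   @ rd = (rn & 0xFFFF) | (rm << 16)
//   pkhtb rd, rn, rm, ASR #16   @ rd = (rn & 0xFFFF0000) | ((rm asr 16) & 0xFFFF)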
+def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), + IIC_iALUsi, "pkhbt", " $dst, $src1, $src2, LSL $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), + (and (shl GPR:$src2, (i32 imm:$shamt)), + 0xFFFF0000)))]>; + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : T2Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), + (t2PKHBT GPR:$src1, GPR:$src2, 0)>; +def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), + (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>; + +def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), + IIC_iALUsi, "pkhtb", " $dst, $src1, $src2, ASR $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), + (and (sra GPR:$src2, imm16_31:$shamt), + 0xFFFF)))]>; + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. +def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))), + (t2PKHTB GPR:$src1, GPR:$src2, 16)>; +def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), + (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)), + (t2PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>; //===----------------------------------------------------------------------===// // Comparison Instructions... // -defm t2CMP : T2I_cmp_is<"cmp", - BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; +defm t2CMP : T2I_cmp_is<"cmp", + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; defm t2CMPz : T2I_cmp_is<"cmp", BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>; -defm t2CMN : T2I_cmp_is<"cmn", - BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; +defm t2CMN : T2I_cmp_is<"cmn", + BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; defm t2CMNz : T2I_cmp_is<"cmn", BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; @@ -923,45 +999,132 @@ defm t2TEQ : T2I_cmp_is<"teq", // Short range conditional branch. Looks awesome for loops. Need to figure // out how to use this one. -// FIXME: Conditional moves + +// Conditional moves +// FIXME: should be able to write a pattern for ARMcmov, but can't use +// a two-value operand where a dag node expects two operands. :( +def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr, + "mov", ".w $dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">; + +def t2MOVCCi : T2I<(outs GPR:$dst), (ins GPR:$false, t2_so_imm:$true), + IIC_iCMOVi, "mov", ".w $dst, $true", +[/*(set GPR:$dst, (ARMcmov GPR:$false, t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">; + +def t2MOVCClsl : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "lsl", ".w $dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +def t2MOVCClsr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "lsr", ".w $dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +def t2MOVCCasr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "asr", ".w $dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; +def t2MOVCCror : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs), + IIC_iCMOVsi, "ror", ".w $dst, $true, $rhs", []>, + RegConstraint<"$false = $dst">; + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. 
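// Lowering-wise this is just a call; roughly, for the EABI helper:
//   bl __aeabi_read_tp   @ returns the thread pointer (TLS base) in r0
// which is why only R0 plus the call-clobbered R12/LR/CPSR appear in Defs.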
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR] in {
+  def t2TPsoft : T2XI<(outs), (ins), IIC_Br,
+                     "bl __aeabi_read_tp",
+                     [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is an instruction sequence to store the return
+// address and save #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved registers, which is exactly what we want.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
+    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+    D31 ] in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src),
+                               AddrModeNone, SizeSpecial, NoItinerary,
+                               "str.w sp, [$src, #+8] @ eh_setjmp begin\n"
+                               "\tadr r12, 0f\n"
+                               "\torr r12, #1\n"
+                               "\tstr.w r12, [$src, #+4]\n"
+                               "\tmovs r0, #0\n"
+                               "\tb 1f\n"
+                               "0:\tmovs r0, #1 @ eh_setjmp end\n"
+                               "1:", "",
+                               [(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
+}
+
+
//===----------------------------------------------------------------------===//
// Control-Flow Instructions
//

+// FIXME: remove when we have a way of marking an MI with these properties.
+// FIXME: $dst1 should be a def. But the extra ops must be at the end of the
+// operand list.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+  def t2LDM_RET : T2XI<(outs),
+                    (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+                    IIC_Br, "ldm${addr:submode}${p}${addr:wide} $addr, $wb",
+                    []>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
-def t2B   : T2XI<(outs), (ins brtarget:$target),
-                 "b $target",
+def t2B   : T2XI<(outs), (ins brtarget:$target), IIC_Br,
+                 "b.w $target",
                  [(br bb:$target)]>;

let isNotDuplicable = 1, isIndirectBranch = 1 in {
-def t2BR_JTr : T2JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
-                     "mov pc, $target \n$jt",
-                     [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+def t2BR_JT :
+  T2JTI<(outs),
+        (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id),
+        IIC_Br, "mov pc, $target\n$jt",
+        [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;

-def t2BR_JTm :
+// FIXME: Add a non-pc based case that can be predicated.
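// TBB / TBH implement compact jump tables: each table entry is a byte /
// halfword, and the branch target is pc plus twice the loaded entry. A
// sketch of the semantics, with r0 as an illustrative index register:
//   tbb [pc, r0]           @ pc += 2 * zext(byte_table[r0])
//   tbh [pc, r0, lsl #1]   @ pc += 2 * zext(halfword_table[r0])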
+def t2TBB : T2JTI<(outs), - (ins t2addrmode_so_reg:$target, jtblock_operand:$jt, i32imm:$id), - "ldr pc, $target \n$jt", - [(ARMbrjt (i32 (load t2addrmode_so_reg:$target)), tjumptable:$jt, - imm:$id)]>; + (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), + IIC_Br, "tbb $index\n$jt", []>; -def t2BR_JTadd : +def t2TBH : T2JTI<(outs), - (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id), - "add pc, $target, $idx \n$jt", - [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, imm:$id)]>; -} // isNotDuplicate, isIndirectBranch + (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), + IIC_Br, "tbh $index\n$jt", []>; +} // isNotDuplicable, isIndirectBranch + } // isBranch, isTerminator, isBarrier // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( let isBranch = 1, isTerminator = 1 in -def t2Bcc : T2I<(outs), (ins brtarget:$target), - "b", " $target", +def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, + "b", ".w $target", [/*(ARMbrcond bb:$target, imm:$cc)*/]>; + +// IT block +def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), + AddrModeNone, Size2Bytes, IIC_iALUx, + "it$mask $cc", "", []>; + //===----------------------------------------------------------------------===// // Non-Instruction Patterns // @@ -972,7 +1135,10 @@ def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), (t2LEApcrelJT tjumptable:$dst, imm:$id)>; -// Large immediate handling. - -def : T2Pat<(i32 imm:$src), - (t2MOVTi16 (t2MOVi16 (t2_lo16 imm:$src)), (t2_hi16 imm:$src))>; +// 32-bit immediate using movw + movt. +// This is a single pseudo instruction to make it re-materializable. Remove +// when we can do generalized remat. +let isReMaterializable = 1 in +def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, + "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}", + [(set GPR:$dst, (i32 imm:$src))]>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 9104c77..56336d1 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -36,57 +36,57 @@ def arm_fmdrr : SDNode<"ARMISD::FMDRR", SDT_FMDRR>; let canFoldAsLoad = 1 in { def FLDD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr), - "fldd", " $dst, $addr", + IIC_fpLoad64, "fldd", " $dst, $addr", [(set DPR:$dst, (load addrmode5:$addr))]>; def FLDS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr), - "flds", " $dst, $addr", + IIC_fpLoad32, "flds", " $dst, $addr", [(set SPR:$dst, (load addrmode5:$addr))]>; } // canFoldAsLoad def FSTD : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr), - "fstd", " $src, $addr", + IIC_fpStore64, "fstd", " $src, $addr", [(store DPR:$src, addrmode5:$addr)]>; def FSTS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr), - "fsts", " $src, $addr", + IIC_fpStore32, "fsts", " $src, $addr", [(store SPR:$src, addrmode5:$addr)]>; //===----------------------------------------------------------------------===// // Load / store multiple Instructions. 
// -let mayLoad = 1 in { -def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1, - variable_ops), - "fldm${addr:submode}d${p} ${addr:base}, $dst1", +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { +def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb, + variable_ops), IIC_fpLoadm, + "fldm${addr:submode}d${p} ${addr:base}, $wb", []> { let Inst{20} = 1; } -def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1, - variable_ops), - "fldm${addr:submode}s${p} ${addr:base}, $dst1", +def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb, + variable_ops), IIC_fpLoadm, + "fldm${addr:submode}s${p} ${addr:base}, $wb", []> { let Inst{20} = 1; } -} +} // mayLoad, hasExtraDefRegAllocReq -let mayStore = 1 in { -def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1, - variable_ops), - "fstm${addr:submode}d${p} ${addr:base}, $src1", +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { +def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb, + variable_ops), IIC_fpStorem, + "fstm${addr:submode}d${p} ${addr:base}, $wb", []> { let Inst{20} = 0; } -def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1, - variable_ops), - "fstm${addr:submode}s${p} ${addr:base}, $src1", +def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb, + variable_ops), IIC_fpStorem, + "fstm${addr:submode}s${p} ${addr:base}, $wb", []> { let Inst{20} = 0; } -} // mayStore +} // mayStore, hasExtraSrcRegAllocReq // FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores @@ -95,46 +95,48 @@ def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1, // def FADDD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - "faddd", " $dst, $a, $b", + IIC_fpALU64, "faddd", " $dst, $a, $b", [(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>; -def FADDS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - "fadds", " $dst, $a, $b", - [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>; +def FADDS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpALU32, "fadds", " $dst, $a, $b", + [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>; // These are encoded as unary instructions. 
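// The compares below only set the FPSCR condition flags; FMSTAT (under
// Misc. further down) copies them into CPSR, so a lowered FP compare and
// branch looks roughly like:
//   fcmped d0, d1   @ FPSCR.{N,Z,C,V} = compare(d0, d1)
//   fmstat          @ copy FPSCR flags into CPSR
//   bgt .Ltaken     @ ordinary conditional execution from here on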
+let Defs = [FPSCR] in { def FCMPED : ADuI<0b11101011, 0b0100, 0b1100, (outs), (ins DPR:$a, DPR:$b), - "fcmped", " $a, $b", + IIC_fpCMP64, "fcmped", " $a, $b", [(arm_cmpfp DPR:$a, DPR:$b)]>; def FCMPES : ASuI<0b11101011, 0b0100, 0b1100, (outs), (ins SPR:$a, SPR:$b), - "fcmpes", " $a, $b", + IIC_fpCMP32, "fcmpes", " $a, $b", [(arm_cmpfp SPR:$a, SPR:$b)]>; +} def FDIVD : ADbI<0b11101000, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - "fdivd", " $dst, $a, $b", + IIC_fpDIV64, "fdivd", " $dst, $a, $b", [(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>; def FDIVS : ASbI<0b11101000, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - "fdivs", " $dst, $a, $b", + IIC_fpDIV32, "fdivs", " $dst, $a, $b", [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>; def FMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - "fmuld", " $dst, $a, $b", + IIC_fpMUL64, "fmuld", " $dst, $a, $b", [(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>; -def FMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - "fmuls", " $dst, $a, $b", - [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>; +def FMULS : ASbIn<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpMUL32, "fmuls", " $dst, $a, $b", + [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>; def FNMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - "fnmuld", " $dst, $a, $b", + IIC_fpMUL64, "fnmuld", " $dst, $a, $b", [(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]> { let Inst{6} = 1; } def FNMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - "fnmuls", " $dst, $a, $b", + IIC_fpMUL32, "fnmuls", " $dst, $a, $b", [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]> { let Inst{6} = 1; } @@ -147,14 +149,14 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b), def FSUBD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b), - "fsubd", " $dst, $a, $b", + IIC_fpALU64, "fsubd", " $dst, $a, $b", [(set DPR:$dst, (fsub DPR:$a, DPR:$b))]> { let Inst{6} = 1; } -def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b), - "fsubs", " $dst, $a, $b", - [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> { +def FSUBS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b), + IIC_fpALU32, "fsubs", " $dst, $a, $b", + [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> { let Inst{6} = 1; } @@ -163,29 +165,31 @@ def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b), // def FABSD : ADuI<0b11101011, 0b0000, 0b1100, (outs DPR:$dst), (ins DPR:$a), - "fabsd", " $dst, $a", + IIC_fpUNA64, "fabsd", " $dst, $a", [(set DPR:$dst, (fabs DPR:$a))]>; -def FABSS : ASuI<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a), - "fabss", " $dst, $a", - [(set SPR:$dst, (fabs SPR:$a))]>; +def FABSS : ASuIn<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a), + IIC_fpUNA32, "fabss", " $dst, $a", + [(set SPR:$dst, (fabs SPR:$a))]>; +let Defs = [FPSCR] in { def FCMPEZD : ADuI<0b11101011, 0b0101, 0b1100, (outs), (ins DPR:$a), - "fcmpezd", " $a", + IIC_fpCMP64, "fcmpezd", " $a", [(arm_cmpfp0 DPR:$a)]>; def FCMPEZS : ASuI<0b11101011, 0b0101, 0b1100, (outs), (ins SPR:$a), - "fcmpezs", " $a", + IIC_fpCMP32, "fcmpezs", " $a", [(arm_cmpfp0 SPR:$a)]>; +} def FCVTDS : ASuI<0b11101011, 0b0111, 0b1100, (outs DPR:$dst), (ins SPR:$a), - "fcvtds", " $dst, $a", + IIC_fpCVTDS, "fcvtds", " $dst, $a", [(set DPR:$dst, (fextend SPR:$a))]>; // Special case encoding: bits 11-8 is 0b1011. 
-def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, - "fcvtsd", " $dst, $a", - [(set SPR:$dst, (fround DPR:$a))]> { +def FCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, + IIC_fpCVTSD, "fcvtsd", " $dst, $a", + [(set SPR:$dst, (fround DPR:$a))]> { let Inst{27-23} = 0b11101; let Inst{21-16} = 0b110111; let Inst{11-8} = 0b1011; @@ -194,26 +198,26 @@ def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, let neverHasSideEffects = 1 in { def FCPYD : ADuI<0b11101011, 0b0000, 0b0100, (outs DPR:$dst), (ins DPR:$a), - "fcpyd", " $dst, $a", []>; + IIC_fpUNA64, "fcpyd", " $dst, $a", []>; def FCPYS : ASuI<0b11101011, 0b0000, 0b0100, (outs SPR:$dst), (ins SPR:$a), - "fcpys", " $dst, $a", []>; + IIC_fpUNA32, "fcpys", " $dst, $a", []>; } // neverHasSideEffects def FNEGD : ADuI<0b11101011, 0b0001, 0b0100, (outs DPR:$dst), (ins DPR:$a), - "fnegd", " $dst, $a", + IIC_fpUNA64, "fnegd", " $dst, $a", [(set DPR:$dst, (fneg DPR:$a))]>; -def FNEGS : ASuI<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a), - "fnegs", " $dst, $a", - [(set SPR:$dst, (fneg SPR:$a))]>; +def FNEGS : ASuIn<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a), + IIC_fpUNA32, "fnegs", " $dst, $a", + [(set SPR:$dst, (fneg SPR:$a))]>; def FSQRTD : ADuI<0b11101011, 0b0001, 0b1100, (outs DPR:$dst), (ins DPR:$a), - "fsqrtd", " $dst, $a", + IIC_fpSQRT64, "fsqrtd", " $dst, $a", [(set DPR:$dst, (fsqrt DPR:$a))]>; def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a), - "fsqrts", " $dst, $a", + IIC_fpSQRT32, "fsqrts", " $dst, $a", [(set SPR:$dst, (fsqrt SPR:$a))]>; //===----------------------------------------------------------------------===// @@ -221,16 +225,16 @@ def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a), // def FMRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src), - "fmrs", " $dst, $src", + IIC_VMOVSI, "fmrs", " $dst, $src", [(set GPR:$dst, (bitconvert SPR:$src))]>; def FMSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src), - "fmsr", " $dst, $src", + IIC_VMOVIS, "fmsr", " $dst, $src", [(set SPR:$dst, (bitconvert GPR:$src))]>; def FMRRD : AVConv3I<0b11000101, 0b1011, - (outs GPR:$dst1, GPR:$dst2), (ins DPR:$src), - "fmrrd", " $dst1, $dst2, $src", + (outs GPR:$wb, GPR:$dst2), (ins DPR:$src), + IIC_VMOVDI, "fmrrd", " $wb, $dst2, $src", [/* FIXME: Can't write pattern for multiple result instr*/]>; // FMDHR: GPR -> SPR @@ -238,7 +242,7 @@ def FMRRD : AVConv3I<0b11000101, 0b1011, def FMDRR : AVConv5I<0b11000100, 0b1011, (outs DPR:$dst), (ins GPR:$src1, GPR:$src2), - "fmdrr", " $dst, $src1, $src2", + IIC_VMOVID, "fmdrr", " $dst, $src1, $src2", [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>; // FMRDH: SPR -> GPR @@ -254,23 +258,23 @@ def FMDRR : AVConv5I<0b11000100, 0b1011, // Int to FP: def FSITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a), - "fsitod", " $dst, $a", + IIC_fpCVTID, "fsitod", " $dst, $a", [(set DPR:$dst, (arm_sitof SPR:$a))]> { let Inst{7} = 1; } -def FSITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a), - "fsitos", " $dst, $a", +def FSITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a), + IIC_fpCVTIS, "fsitos", " $dst, $a", [(set SPR:$dst, (arm_sitof SPR:$a))]> { let Inst{7} = 1; } def FUITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a), - "fuitod", " $dst, $a", + IIC_fpCVTID, "fuitod", " $dst, $a", [(set DPR:$dst, (arm_uitof SPR:$a))]>; -def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a), 
- "fuitos", " $dst, $a", +def FUITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a), + IIC_fpCVTIS, "fuitos", " $dst, $a", [(set SPR:$dst, (arm_uitof SPR:$a))]>; // FP to Int: @@ -278,28 +282,28 @@ def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a), def FTOSIZD : AVConv1I<0b11101011, 0b1101, 0b1011, (outs SPR:$dst), (ins DPR:$a), - "ftosizd", " $dst, $a", + IIC_fpCVTDI, "ftosizd", " $dst, $a", [(set SPR:$dst, (arm_ftosi DPR:$a))]> { let Inst{7} = 1; // Z bit } -def FTOSIZS : AVConv1I<0b11101011, 0b1101, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - "ftosizs", " $dst, $a", +def FTOSIZS : AVConv1In<0b11101011, 0b1101, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTSI, "ftosizs", " $dst, $a", [(set SPR:$dst, (arm_ftosi SPR:$a))]> { let Inst{7} = 1; // Z bit } def FTOUIZD : AVConv1I<0b11101011, 0b1100, 0b1011, (outs SPR:$dst), (ins DPR:$a), - "ftouizd", " $dst, $a", + IIC_fpCVTDI, "ftouizd", " $dst, $a", [(set SPR:$dst, (arm_ftoui DPR:$a))]> { let Inst{7} = 1; // Z bit } -def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010, - (outs SPR:$dst), (ins SPR:$a), - "ftouizs", " $dst, $a", +def FTOUIZS : AVConv1In<0b11101011, 0b1100, 0b1010, + (outs SPR:$dst), (ins SPR:$a), + IIC_fpCVTSI, "ftouizs", " $dst, $a", [(set SPR:$dst, (arm_ftoui SPR:$a))]> { let Inst{7} = 1; // Z bit } @@ -309,48 +313,53 @@ def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010, // def FMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - "fmacd", " $dst, $a, $b", + IIC_fpMAC64, "fmacd", " $dst, $a, $b", [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def FMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - "fmacs", " $dst, $a, $b", - [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, - RegConstraint<"$dstin = $dst">; +def FMACS : ASbIn<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), + IIC_fpMAC32, "fmacs", " $dst, $a, $b", + [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; def FMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - "fmscd", " $dst, $a, $b", + IIC_fpMAC64, "fmscd", " $dst, $a, $b", [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>, RegConstraint<"$dstin = $dst">; def FMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - "fmscs", " $dst, $a, $b", + IIC_fpMAC32, "fmscs", " $dst, $a, $b", [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; def FNMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - "fnmacd", " $dst, $a, $b", + IIC_fpMAC64, "fnmacd", " $dst, $a, $b", [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>, RegConstraint<"$dstin = $dst"> { let Inst{6} = 1; } -def FNMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - "fnmacs", " $dst, $a, $b", +def FNMACS : ASbIn<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), + IIC_fpMAC32, "fnmacs", " $dst, $a, $b", [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, RegConstraint<"$dstin = $dst"> { let Inst{6} = 1; } +def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, DPR:$b)), + (FNMACD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>; +def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), + (FNMACS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; + def FNMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), - "fnmscd", " $dst, $a, $b", + 
IIC_fpMAC64, "fnmscd", " $dst, $a, $b", [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>, RegConstraint<"$dstin = $dst"> { let Inst{6} = 1; } def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), - "fnmscs", " $dst, $a, $b", + IIC_fpMAC32, "fnmscs", " $dst, $a, $b", [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, RegConstraint<"$dstin = $dst"> { let Inst{6} = 1; @@ -362,25 +371,25 @@ def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b), def FCPYDcc : ADuI<0b11101011, 0b0000, 0b0100, (outs DPR:$dst), (ins DPR:$false, DPR:$true), - "fcpyd", " $dst, $true", + IIC_fpUNA64, "fcpyd", " $dst, $true", [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>, RegConstraint<"$false = $dst">; def FCPYScc : ASuI<0b11101011, 0b0000, 0b0100, (outs SPR:$dst), (ins SPR:$false, SPR:$true), - "fcpys", " $dst, $true", + IIC_fpUNA32, "fcpys", " $dst, $true", [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>, RegConstraint<"$false = $dst">; def FNEGDcc : ADuI<0b11101011, 0b0001, 0b0100, (outs DPR:$dst), (ins DPR:$false, DPR:$true), - "fnegd", " $dst, $true", + IIC_fpUNA64, "fnegd", " $dst, $true", [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>, RegConstraint<"$false = $dst">; def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$false, SPR:$true), - "fnegs", " $dst, $true", + IIC_fpUNA32, "fnegs", " $dst, $true", [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>, RegConstraint<"$false = $dst">; @@ -389,8 +398,8 @@ def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100, // Misc. // -let Defs = [CPSR] in -def FMSTAT : AI<(outs), (ins), VFPMiscFrm, "fmstat", "", [(arm_fmstat)]> { +let Defs = [CPSR], Uses = [FPSCR] in +def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "fmstat", "", [(arm_fmstat)]> { let Inst{27-20} = 0b11101111; let Inst{19-16} = 0b0001; let Inst{15-12} = 0b1111; diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index e551c41..24990e6 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -19,15 +19,15 @@ #include "ARMSubtarget.h" #include "llvm/Function.h" #include "llvm/CodeGen/JITCodeEmitter.h" -#include "llvm/Config/alloca.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Streams.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/System/Memory.h" #include <cstdlib> using namespace llvm; void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { - abort(); + llvm_report_error("ARMJITInfo::replaceMachineCodeForFunction"); } /// JITCompilerFunction - This contains the address of the JIT function used to @@ -45,11 +45,11 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction; // CompilationCallback stub - We can't use a C function with inline assembly in // it, because we the prolog/epilog inserted by GCC won't work for us (we need // to preserve more context and manipulate the stack directly). Instead, -// write our own wrapper, which does things our way, so we have complete +// write our own wrapper, which does things our way, so we have complete // control over register saving and restoring. extern "C" { #if defined(__arm__) - void ARMCompilationCallback(void); + void ARMCompilationCallback(); asm( ".text\n" ".align 2\n" @@ -77,11 +77,11 @@ extern "C" { // order for the registers. 
// +--------+ // 0 | LR | Original return address - // +--------+ + // +--------+ // 1 | LR | Stub address (start of stub) // 2-5 | R3..R0 | Saved registers (we need to preserve all regs) // 6-20 | D0..D7 | Saved VFP registers - // +--------+ + // +--------+ // #ifndef __SOFTFP__ // Restore VFP caller-saved registers. @@ -103,15 +103,14 @@ extern "C" { ); #else // Not an ARM host void ARMCompilationCallback() { - assert(0 && "Cannot call ARMCompilationCallback() on a non-ARM arch!\n"); - abort(); + llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!"); } #endif } -/// ARMCompilationCallbackC - This is the target-specific function invoked -/// by the function stub when we did not know the real target of a call. -/// This function must locate the start of the stub or call site and pass +/// ARMCompilationCallbackC - This is the target-specific function invoked +/// by the function stub when we did not know the real target of a call. +/// This function must locate the start of the stub or call site and pass /// it into the JIT compiler function. extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) { // Get the address of the compiled code for this function. @@ -123,14 +122,12 @@ extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) { // ldr pc, [pc,#-4] // <addr> if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) { - cerr << "ERROR: Unable to mark stub writable\n"; - abort(); + llvm_unreachable("ERROR: Unable to mark stub writable"); } *(intptr_t *)StubAddr = 0xe51ff004; // ldr pc, [pc, #-4] *(intptr_t *)(StubAddr+4) = NewVal; if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) { - cerr << "ERROR: Unable to mark stub executable\n"; - abort(); + llvm_unreachable("ERROR: Unable to mark stub executable"); } } @@ -143,7 +140,14 @@ ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) { void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr, JITCodeEmitter &JCE) { JCE.startGVStub(GV, 4, 4); + intptr_t Addr = (intptr_t)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable((void*)Addr, 4)) { + llvm_unreachable("ERROR: Unable to mark indirect symbol writable"); + } JCE.emitWordLE((intptr_t)Ptr); + if (!sys::Memory::setRangeExecutable((void*)Addr, 4)) { + llvm_unreachable("ERROR: Unable to mark indirect symbol executable"); + } void *PtrAddr = JCE.finishGVStub(GV); addIndirectSymAddr(Ptr, (intptr_t)PtrAddr); return PtrAddr; @@ -161,31 +165,43 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn, if (!LazyPtr) { // In PIC mode, the function stub is loading a lazy-ptr. 
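    // A sketch of the 16-byte PIC stub emitted below; note that the constant
    // 0xe59fc004 encodes "ldr ip, [pc, #+4]" (ip = r12):
    //   ldr ip, [pc, #+4]               @ ip = pc-relative offset word
    //   L_func$scv: add ip, pc, ip      @ ip = address of the lazy-ptr
    //   ldr pc, [ip]                    @ jump through the lazy pointer
    //   .word LazyPtr - (L_func$scv+8)  @ distance from L_func$scv+8 to the lazy-ptr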
LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE); - if (F) - DOUT << "JIT: Indirect symbol emitted at [" << LazyPtr << "] for GV '" - << F->getName() << "'\n"; - else - DOUT << "JIT: Stub emitted at [" << LazyPtr - << "] for external function at '" << Fn << "'\n"; + DEBUG(if (F) + errs() << "JIT: Indirect symbol emitted at [" << LazyPtr + << "] for GV '" << F->getName() << "'\n"; + else + errs() << "JIT: Stub emitted at [" << LazyPtr + << "] for external function at '" << Fn << "'\n"); } JCE.startGVStub(F, 16, 4); intptr_t Addr = (intptr_t)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable((void*)Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } JCE.emitWordLE(0xe59fc004); // ldr pc, [pc, #+4] JCE.emitWordLE(0xe08fc00c); // L_func$scv: add ip, pc, ip JCE.emitWordLE(0xe59cf000); // ldr pc, [ip] JCE.emitWordLE(LazyPtr - (Addr+4+8)); // func - (L_func$scv+8) sys::Memory::InvalidateInstructionCache((void*)Addr, 16); + if (!sys::Memory::setRangeExecutable((void*)Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } } else { // The stub is 8-byte size and 4-aligned. JCE.startGVStub(F, 8, 4); intptr_t Addr = (intptr_t)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable((void*)Addr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4] JCE.emitWordLE((intptr_t)Fn); // addr of function sys::Memory::InvalidateInstructionCache((void*)Addr, 8); + if (!sys::Memory::setRangeExecutable((void*)Addr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } } } else { // The compilation callback will overwrite the first two words of this - // stub with indirect branch instructions targeting the compiled code. + // stub with indirect branch instructions targeting the compiled code. // This stub sets the return address to restart the stub, so that // the new branch will be invoked when we come back. // @@ -193,6 +209,9 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn, // The stub is 16-byte size and 4-byte aligned. JCE.startGVStub(F, 16, 4); intptr_t Addr = (intptr_t)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable((void*)Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } // Save LR so the callback can determine which stub called it. // The compilation callback is responsible for popping this prior // to returning. @@ -204,6 +223,9 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn, // The address of the compilation callback. 
JCE.emitWordLE((intptr_t)ARMCompilationCallback); sys::Memory::InvalidateInstructionCache((void*)Addr, 16); + if (!sys::Memory::setRangeExecutable((void*)Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } } return JCE.finishGVStub(F); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 59cf125..d2ec9ee 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -15,9 +15,11 @@ #define DEBUG_TYPE "arm-ldst-opt" #include "ARM.h" #include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" #include "llvm/DerivedTypes.h" +#include "llvm/Function.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -29,6 +31,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -61,6 +64,7 @@ namespace { const TargetRegisterInfo *TRI; ARMFunctionInfo *AFI; RegScavenger *RS; + bool isThumb2; virtual bool runOnMachineFunction(MachineFunction &Fn); @@ -93,6 +97,15 @@ namespace { void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); + bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const TargetInstrInfo *TII, + bool &Advance, + MachineBasicBlock::iterator &I); + bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool &Advance, + MachineBasicBlock::iterator &I); bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; @@ -107,6 +120,14 @@ static int getLoadStoreMultipleOpcode(int Opcode) { case ARM::STR: NumSTMGened++; return ARM::STM; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + NumLDMGened++; + return ARM::t2LDM; + case ARM::t2STRi8: + case ARM::t2STRi12: + NumSTMGened++; + return ARM::t2STM; case ARM::FLDS: NumFLDMGened++; return ARM::FLDMS; @@ -119,14 +140,30 @@ static int getLoadStoreMultipleOpcode(int Opcode) { case ARM::FSTD: NumFSTMGened++; return ARM::FSTMD; - default: abort(); + default: llvm_unreachable("Unhandled opcode!"); } return 0; } +static bool isT2i32Load(unsigned Opc) { + return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; +} + +static bool isi32Load(unsigned Opc) { + return Opc == ARM::LDR || isT2i32Load(Opc); +} + +static bool isT2i32Store(unsigned Opc) { + return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8; +} + +static bool isi32Store(unsigned Opc) { + return Opc == ARM::STR || isT2i32Store(Opc); +} + /// MergeOps - Create and insert a LDM or STM with Base as base register and /// registers in Regs as the register operands that would be loaded / stored. -/// It returns true if the transformation is done. +/// It returns true if the transformation is done. bool ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -140,14 +177,20 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, return false; ARM_AM::AMSubMode Mode = ARM_AM::ia; - bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR; - if (isAM4 && Offset == 4) + bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); + if (isAM4 && Offset == 4) { + if (isThumb2) + // Thumb2 does not support ldmib / stmib. 
+      return false;
    Mode = ARM_AM::ib;
-  else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
+  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
+    if (isThumb2)
+      // Thumb2 does not support ldmda / stmda.
+      return false;
    Mode = ARM_AM::da;
-  else if (isAM4 && Offset == -4 * (int)NumRegs)
+  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
-  else if (Offset != 0) {
+  } else if (Offset != 0) {
    // If starting offset isn't zero, insert an MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
@@ -155,7 +198,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
      return false;

    unsigned NewBase;
-    if (Opcode == ARM::LDR)
+    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
@@ -165,24 +208,30 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
      if (NewBase == 0)
        return false;
    }
-    int BaseOpc = ARM::ADDri;
+    int BaseOpc = !isThumb2
+      ? ARM::ADDri
+      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
-      BaseOpc = ARM::SUBri;
+      BaseOpc = !isThumb2
+        ? ARM::SUBri
+        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
-    int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+    int ImmedOffset = isThumb2
+      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
+      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false;  // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-      .addReg(Base, getKillRegState(BaseKill)).addImm(ImmedOffset)
+      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
-  bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+  bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
@@ -192,6 +241,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
+  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));
@@ -207,7 +257,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
-  bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned Pos = MemOps[SIndex].Position;
@@ -265,41 +315,53 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
  return;
}

-/// getInstrPredicate - If instruction is predicated, returns its predicate
-/// condition, otherwise returns AL. It also returns the condition code
-/// register by reference.
-static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) { - int PIdx = MI->findFirstPredOperandIdx(); - if (PIdx == -1) { - PredReg = 0; - return ARMCC::AL; - } - - PredReg = MI->getOperand(PIdx+1).getReg(); - return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm(); -} - static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, ARMCC::CondCodes Pred, - unsigned PredReg) { + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg){ unsigned MyPredReg = 0; - return (MI && MI->getOpcode() == ARM::SUBri && - MI->getOperand(0).getReg() == Base && + if (!MI) + return false; + if (MI->getOpcode() != ARM::t2SUBri && + MI->getOpcode() != ARM::t2SUBrSPi && + MI->getOpcode() != ARM::t2SUBrSPi12 && + MI->getOpcode() != ARM::tSUBspi && + MI->getOpcode() != ARM::SUBri) + return false; + + // Make sure the offset fits in 8 bits. + if (Bytes <= 0 || (Limit && Bytes >= Limit)) + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME + return (MI->getOperand(0).getReg() == Base && MI->getOperand(1).getReg() == Base && - ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && + (MI->getOperand(2).getImm()*Scale) == Bytes && + llvm::getInstrPredicate(MI, MyPredReg) == Pred && MyPredReg == PredReg); } static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, ARMCC::CondCodes Pred, - unsigned PredReg) { + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg){ unsigned MyPredReg = 0; - return (MI && MI->getOpcode() == ARM::ADDri && - MI->getOperand(0).getReg() == Base && + if (!MI) + return false; + if (MI->getOpcode() != ARM::t2ADDri && + MI->getOpcode() != ARM::t2ADDrSPi && + MI->getOpcode() != ARM::t2ADDrSPi12 && + MI->getOpcode() != ARM::tADDspi && + MI->getOpcode() != ARM::ADDri) + return false; + + if (Bytes <= 0 || (Limit && Bytes >= Limit)) + // Make sure the offset fits in 8 bits. + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 
4 : 1; // FIXME
+  return (MI->getOperand(0).getReg() == Base &&
           MI->getOperand(1).getReg() == Base &&
-          ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
-          getInstrPredicate(MI, MyPredReg) == Pred &&
+          (MI->getOperand(2).getImm()*Scale) == Bytes &&
+          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
           MyPredReg == PredReg);
 }
 
@@ -308,6 +370,10 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   default: return 0;
   case ARM::LDR:
   case ARM::STR:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
   case ARM::FLDS:
   case ARM::FSTS:
     return 4;
@@ -316,7 +382,9 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
     return 8;
   case ARM::LDM:
   case ARM::STM:
-    return (MI->getNumOperands() - 4) * 4;
+  case ARM::t2LDM:
+  case ARM::t2STM:
+    return (MI->getNumOperands() - 5) * 4;
   case ARM::FLDMS:
   case ARM::FSTMS:
   case ARM::FLDMD:
@@ -325,7 +393,7 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   }
 }
 
-/// mergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
+/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
 /// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
 ///
 /// stmia rn, <ra, rb, rc>
 /// rn := rn + 4 * 3;
 /// =>
 /// stmia rn!, <ra, rb, rc>
@@ -337,17 +405,18 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI)
 /// ldmia rn, <ra, rb, rc>
 /// =>
 /// ldmdb rn!, <ra, rb, rc>
-static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MBBI,
-                                      bool &Advance,
-                                      MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        bool &Advance,
+                                        MachineBasicBlock::iterator &I) {
   MachineInstr *MI = MBBI;
   unsigned Base = MI->getOperand(0).getReg();
   unsigned Bytes = getLSMultipleTransferSize(MI);
   unsigned PredReg = 0;
-  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
   int Opcode = MI->getOpcode();
-  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
+  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
+               Opcode == ARM::STM || Opcode == ARM::t2STM;
 
   if (isAM4) {
     if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
@@ -364,13 +433,17 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     if (MBBI != MBB.begin()) {
       MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
       if (Mode == ARM_AM::ia &&
-          isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+        MI->getOperand(4).setReg(Base);
+        MI->getOperand(4).setIsDef();
         MBB.erase(PrevMBBI);
         return true;
       } else if (Mode == ARM_AM::ib &&
-                 isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
         MBB.erase(PrevMBBI);
         return true;
       }
@@ -379,8 +452,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     if (MBBI != MBB.end()) {
       MachineBasicBlock::iterator NextMBBI = next(MBBI);
       if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
-          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
         if (NextMBBI == I) {
           Advance = true;
           ++I;
@@ -388,8 +463,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
         MBB.erase(NextMBBI);
         return true;
       } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
-                 isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
         if (NextMBBI == I) {
           Advance = true;
           ++I;
@@ -408,8 +485,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     if (MBBI != MBB.begin()) {
       MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
       if (Mode == ARM_AM::ia &&
-          isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
         MBB.erase(PrevMBBI);
         return true;
       }
@@ -418,8 +497,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     if (MBBI != MBB.end()) {
       MachineBasicBlock::iterator NextMBBI = next(MBBI);
       if (Mode == ARM_AM::ia &&
-          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
         if (NextMBBI == I) {
           Advance = true;
           ++I;
@@ -441,7 +522,13 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
   case ARM::FLDD: return ARM::FLDMD;
   case ARM::FSTS: return ARM::FSTMS;
   case ARM::FSTD: return ARM::FSTMD;
-  default: abort();
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_PRE;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_PRE;
+  default: llvm_unreachable("Unhandled opcode!");
   }
   return 0;
 }
@@ -454,48 +541,62 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
   case ARM::FLDD: return ARM::FLDMD;
   case ARM::FSTS: return ARM::FSTMS;
   case ARM::FSTD: return ARM::FSTMD;
-  default: abort();
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_POST;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_POST;
+  default: llvm_unreachable("Unhandled opcode!");
   }
   return 0;
 }
 
-/// mergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
+/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
 /// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MBBI,
-                                     const TargetInstrInfo *TII,
-                                     bool &Advance,
-                                     MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       const TargetInstrInfo *TII,
+                                       bool &Advance,
+                                       MachineBasicBlock::iterator &I) {
   MachineInstr *MI = MBBI;
   unsigned Base = MI->getOperand(1).getReg();
   bool BaseKill = MI->getOperand(1).isKill();
   unsigned Bytes = getLSMultipleTransferSize(MI);
   int Opcode = MI->getOpcode();
   DebugLoc dl = MI->getDebugLoc();
+  bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS ||
+               Opcode == ARM::FSTD || Opcode == ARM::FSTS;
   bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
-  if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
-      (!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
+  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
     return false;
+  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+    return false;
+  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
+    if (MI->getOperand(2).getImm() != 0)
+      return false;
 
-  bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+  bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
   // Can't do the merge if the destination register is the same as the would-be
   // writeback register.
   if (isLd && MI->getOperand(0).getReg() == Base)
     return false;
 
   unsigned PredReg = 0;
-  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
   bool DoMerge = false;
   ARM_AM::AddrOpc AddSub = ARM_AM::add;
   unsigned NewOpc = 0;
+  // AM2 - 12 bits, thumb2 - 8 bits.
+  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
 
   if (MBBI != MBB.begin()) {
     MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
-    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
       DoMerge = true;
       AddSub = ARM_AM::sub;
       NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
-    } else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes,
-                                            Pred, PredReg)) {
+    } else if (!isAM5 &&
+               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
       DoMerge = true;
       NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
     }
@@ -505,11 +606,12 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
 
   if (!DoMerge && MBBI != MBB.end()) {
     MachineBasicBlock::iterator NextMBBI = next(MBBI);
-    if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+    if (!isAM5 &&
+        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
       DoMerge = true;
       AddSub = ARM_AM::sub;
       NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
-    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
       DoMerge = true;
       NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
     }
@@ -526,33 +628,51 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
     return false;
 
   bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
-  unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
-    : ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
-                        true, isDPR ? 2 : 1);
+  unsigned Offset = 0;
+  if (isAM5)
+    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
+                               ? ARM_AM::db
+                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
+  else if (isAM2)
+    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+  else
+    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
   if (isLd) {
-    if (isAM2)
-      // LDR_PRE, LDR_POST;
-      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
-        .addReg(Base, RegState::Define)
-        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
-    else
+    if (isAM5)
       // FLDMS, FLDMD
       BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
         .addReg(Base, getKillRegState(BaseKill))
         .addImm(Offset).addImm(Pred).addReg(PredReg)
+        .addReg(Base, getDefRegState(true)) // WB base register
         .addReg(MI->getOperand(0).getReg(), RegState::Define);
-  } else {
-    MachineOperand &MO = MI->getOperand(0);
-    if (isAM2)
-      // STR_PRE, STR_POST;
-      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
-        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+    else if (isAM2)
+      // LDR_PRE, LDR_POST,
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, RegState::Define)
         .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
     else
+      // t2LDR_PRE, t2LDR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, RegState::Define)
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    if (isAM5)
      // FSTMS, FSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
+        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
+    else if (isAM2)
+      // STR_PRE, STR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    else
+      // t2STR_PRE, t2STR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
   }
   MBB.erase(MBBI);
@@ -561,7 +681,7 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
 
 /// isMemoryOp - Returns true if instruction is a memory operation (that this
 /// pass is capable of operating on).
-static bool isMemoryOp(MachineInstr *MI) {
+static bool isMemoryOp(const MachineInstr *MI) {
   int Opcode = MI->getOpcode();
   switch (Opcode) {
   default: break;
@@ -574,6 +694,11 @@ static bool isMemoryOp(MachineInstr *MI) {
   case ARM::FLDD:
   case ARM::FSTD:
     return MI->getOperand(1).isReg();
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return MI->getOperand(1).isReg();
   }
   return false;
 }
@@ -600,6 +725,12 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
   bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
   unsigned NumOperands = MI->getDesc().getNumOperands();
   unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
+    return OffField;
+
   int Offset = isAM2
     ? ARM_AM::getAM2Offset(OffField)
     : (isAM3 ? ARM_AM::getAM3Offset(OffField)
@@ -621,37 +752,43 @@ static void InsertLDR_STR(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI,
                           int OffImm, bool isDef,
                           DebugLoc dl, unsigned NewOpc,
-                          unsigned Reg, bool RegDeadKill,
-                          unsigned BaseReg, bool BaseKill,
-                          unsigned OffReg, bool OffKill,
+                          unsigned Reg, bool RegDeadKill, bool RegUndef,
+                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
+                          unsigned OffReg, bool OffKill, bool OffUndef,
                           ARMCC::CondCodes Pred, unsigned PredReg,
-                          const TargetInstrInfo *TII) {
-  unsigned Offset;
-  if (OffImm < 0)
-    Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
-  else
-    Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
-  if (isDef)
-    BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+                          const TargetInstrInfo *TII, bool isT2) {
+  int Offset = OffImm;
+  if (!isT2) {
+    if (OffImm < 0)
+      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
+    else
+      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
+  }
+  if (isDef) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
       .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
-      .addReg(BaseReg, getKillRegState(BaseKill))
-      .addReg(OffReg, getKillRegState(OffKill))
-      .addImm(Offset)
-      .addImm(Pred).addReg(PredReg);
-  else
-    BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
-      .addReg(Reg, getKillRegState(RegDeadKill))
-      .addReg(BaseReg, getKillRegState(BaseKill))
-      .addReg(OffReg, getKillRegState(OffKill))
-      .addImm(Offset)
-      .addImm(Pred).addReg(PredReg);
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    if (!isT2)
+      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    if (!isT2)
+      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  }
 }
 
 bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator &MBBI) {
   MachineInstr *MI = &*MBBI;
   unsigned Opcode = MI->getOpcode();
-  if (Opcode == ARM::LDRD || Opcode == ARM::STRD) {
+  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
     unsigned EvenReg = MI->getOperand(0).getReg();
     unsigned OddReg = MI->getOperand(1).getReg();
     unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
@@ -659,45 +796,59 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
     if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
       return false;
 
-    bool isLd = Opcode == ARM::LDRD;
+    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
     bool EvenDeadKill = isLd ?
       MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
+    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+    bool OddUndef = MI->getOperand(1).isUndef();
     const MachineOperand &BaseOp = MI->getOperand(2);
     unsigned BaseReg = BaseOp.getReg();
     bool BaseKill = BaseOp.isKill();
-    const MachineOperand &OffOp = MI->getOperand(3);
-    unsigned OffReg = OffOp.getReg();
-    bool OffKill = OffOp.isKill();
+    bool BaseUndef = BaseOp.isUndef();
+    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
+    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
     int OffImm = getMemoryOpOffset(MI);
     unsigned PredReg = 0;
-    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
 
     if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
       // Ascending register numbers and no offset. It's safe to change it to a
       // ldm or stm.
-      unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDM : ARM::STM;
+      unsigned NewOpc = (isLd)
+        ? (isT2 ? ARM::t2LDM : ARM::LDM)
+        : (isT2 ? ARM::t2STM : ARM::STM);
       if (isLd) {
         BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
           .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
+          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
-          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
         ++NumLDRD2LDM;
       } else {
         BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
-          .addReg(EvenReg, getKillRegState(EvenDeadKill))
-          .addReg(OddReg, getKillRegState(OddDeadKill));
+          .addReg(0)
+          .addReg(EvenReg,
+                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+          .addReg(OddReg,
+                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
         ++NumSTRD2STM;
       }
     } else {
       // Split into two instructions.
-      unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDR : ARM::STR;
+      assert((!isT2 || !OffReg) &&
+             "Thumb2 ldrd / strd does not encode offset register!");
+      unsigned NewOpc = (isLd)
+        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
+        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
       DebugLoc dl = MBBI->getDebugLoc();
       // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
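The rewrite above is driven by an ISA constraint: ARM ldrd / strd can only transfer an even/odd register pair (r0/r1, r2/r3, ...), so any other pairing has to be turned into an ldm/stm or split into two single loads/stores. A minimal standalone C++ sketch of that legality test, mirroring the EvenRegNum/OddRegNum check in the hunk above (the helper name is ours, not part of this commit):

// Sketch only: true when a register-number pair is directly encodable as
// LDRD/STRD, i.e. the first number is even and the second is consecutive.
static bool isEncodableRegPair(unsigned EvenRegNum, unsigned OddRegNum) {
  return (EvenRegNum & 1) == 0 && EvenRegNum + 1 == OddRegNum;
}

Pairs that fail this test take the LDM/STM or split-into-two path shown in the surrounding hunks.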
@@ -707,17 +858,23 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
            (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
-        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill,
-                      BaseReg, false, OffReg, false, Pred, PredReg, TII);
-        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill,
-                      BaseReg, BaseKill, OffReg, OffKill, Pred, PredReg, TII);
+        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
+                      OddReg, OddDeadKill, false,
+                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+                      Pred, PredReg, TII, isT2);
+        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                      EvenReg, EvenDeadKill, false,
+                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+                      Pred, PredReg, TII, isT2);
      } else {
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
-                      EvenReg, EvenDeadKill, BaseReg, false, OffReg, false,
-                      Pred, PredReg, TII);
+                      EvenReg, EvenDeadKill, EvenUndef,
+                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
-                      OddReg, OddDeadKill, BaseReg, BaseKill, OffReg, OffKill,
-                      Pred, PredReg, TII);
+                      OddReg, OddDeadKill, OddUndef,
+                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
@@ -761,7 +918,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
-      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
@@ -772,7 +929,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
-      Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
+      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
@@ -825,12 +982,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
-        // Find a scratch register. Make sure it's a call clobbered register or
-        // a spilled callee-saved register.
-        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
-        if (!Scratch)
-          Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
-                                      AFI->getSpilledCSRegisters());
+        // Find a scratch register.
+        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));
@@ -842,7 +995,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
-          if (mergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
+          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();
 
@@ -850,15 +1003,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
-            if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
+            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;
 
-        // RS may be pointing to an instruction that's deleted. 
+        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
-        if (mergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
+        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
@@ -907,16 +1060,18 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;
 
  MachineBasicBlock::iterator MBBI = prior(MBB.end());
-  if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
+  if (MBBI != MBB.begin() &&
+      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
    MachineInstr *PrevMI = prior(MBBI);
-    if (PrevMI->getOpcode() == ARM::LDM) {
+    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
-      if (MO.getReg() == ARM::LR) {
-        PrevMI->setDesc(TII->get(ARM::LDM_RET));
-        MO.setReg(ARM::PC);
-        MBB.erase(MBBI);
-        return true;
-      }
+      if (MO.getReg() != ARM::LR)
+        return false;
+      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
+      PrevMI->setDesc(TII->get(NewOpc));
+      MO.setReg(ARM::PC);
+      MBB.erase(MBBI);
+      return true;
    }
  }
  return false;
@@ -928,6 +1083,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  RS = new RegScavenger();
+  isThumb2 = AFI->isThumb2Function();
 
  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -956,6 +1112,7 @@ namespace {
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
+    MachineFunction *MF;
 
    virtual bool runOnMachineFunction(MachineFunction &Fn);
 
@@ -967,8 +1124,9 @@ namespace {
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
-                          unsigned &OffReg, unsigned &Offset,
-                          unsigned &PredReg, ARMCC::CondCodes &Pred);
+                          unsigned &OffReg, int &Offset,
+                          unsigned &PredReg, ARMCC::CondCodes &Pred,
+                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
@@ -984,6 +1142,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
+  MF = &Fn;
 
  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -1045,48 +1204,83 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
-                                          unsigned &OffReg, unsigned &Offset,
+                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
-                                          ARMCC::CondCodes &Pred) {
+                                          ARMCC::CondCodes &Pred,
+                                          bool &isT2) {
+  // Make sure we're allowed to generate LDRD/STRD.
+  if (!STI->hasV5TEOps())
+    return false;
+
+  // FIXME: FLDS / FSTS -> FLDD / FSTD
+  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
-  else
-    return 0;
+  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+    NewOpc = ARM::t2LDRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+    NewOpc = ARM::t2STRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else
+    return false;
+
+  // Make sure the offset registers match.
+  if (!isT2 &&
+      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
+    return false;
 
  // Make sure the base address satisfies i64 ld / st alignment requirement.
  if (!Op0->hasOneMemOperand() ||
-      !Op0->memoperands_begin()->getValue() ||
-      Op0->memoperands_begin()->isVolatile())
+      !(*Op0->memoperands_begin())->getValue() ||
+      (*Op0->memoperands_begin())->isVolatile())
    return false;
 
-  unsigned Align = Op0->memoperands_begin()->getAlignment();
+  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
-    ? TD->getPrefTypeAlignment(Type::Int64Ty) : 8; // Pre-v6 need 8-byte align
+    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
+    : 8;  // Pre-v6 need 8-byte align
  if (Align < ReqAlign)
    return false;
 
  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
-  ARM_AM::AddrOpc AddSub = ARM_AM::add;
-  if (OffImm < 0) {
-    AddSub = ARM_AM::sub;
-    OffImm = - OffImm;
+  if (isT2) {
+    if (OffImm < 0) {
+      if (OffImm < -255)
+        // Can't fall back to t2LDRi8 / t2STRi8.
+        return false;
+    } else {
+      int Limit = (1 << 8) * Scale;
+      if (OffImm >= Limit || (OffImm & (Scale-1)))
+        return false;
+    }
+    Offset = OffImm;
+  } else {
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (OffImm < 0) {
+      AddSub = ARM_AM::sub;
+      OffImm = - OffImm;
+    }
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm & (Scale-1)))
+      return false;
+    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
-  if (OffImm >= 256) // 8 bits
-    return false;
-  Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
-
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
-  OffReg = Op0->getOperand(2).getReg();
-  Pred = getInstrPredicate(Op0, PredReg);
+  if (!isT2)
+    OffReg = Op0->getOperand(2).getReg();
+  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
 }
@@ -1138,7 +1332,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
        LastOffset = Offset;
        LastBytes = Bytes;
        LastOpcode = Opcode;
-        if (++NumMove == 8) // FIXME: Tune
+        if (++NumMove == 8) // FIXME: Tune this limit.
          break;
      }
@@ -1174,29 +1368,36 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
        unsigned NewOpc = 0;
-        unsigned Offset = 0;
+        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
-                                             Offset, PredReg, Pred)) {
+                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();
 
          // Form the pair instruction.
          if (isLd) {
-            BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
-              .addReg(BaseReg).addReg(0).addImm(Offset)
-              .addImm(Pred).addReg(PredReg);
+              .addReg(BaseReg);
+            if (!isT2)
+              MIB.addReg(OffReg);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
-            BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
-              .addReg(BaseReg).addReg(0).addImm(Offset)
-              .addImm(Pred).addReg(PredReg);
+              .addReg(BaseReg);
+            if (!isT2)
+              MIB.addReg(OffReg);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
@@ -1249,12 +1450,11 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
-      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;
-      int Opcode = MI->getOpcode();
-      bool isLd = Opcode == ARM::LDR ||
-        Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+      int Opc = MI->getOpcode();
+      bool isLd = isi32Load(Opc) || Opc == ARM::FLDS || Opc == ARM::FLDD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);
diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp
new file mode 100644
index 0000000..0ff65d2
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.cpp
@@ -0,0 +1,72 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+using namespace llvm;
+
+static const char *const arm_asm_table[] = {
+  "{r0}", "r0",
+  "{r1}", "r1",
+  "{r2}", "r2",
+  "{r3}", "r3",
+  "{r4}", "r4",
+  "{r5}", "r5",
+  "{r6}", "r6",
+  "{r7}", "r7",
+  "{r8}", "r8",
+  "{r9}", "r9",
+  "{r10}", "r10",
+  "{r11}", "r11",
+  "{r12}", "r12",
+  "{r13}", "r13",
+  "{r14}", "r14",
+  "{lr}", "lr",
+  "{sp}", "sp",
+  "{ip}", "ip",
+  "{fp}", "fp",
+  "{sl}", "sl",
+  "{memory}", "memory",
+  "{cc}", "cc",
+  0,0
+};
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
+  AsmTransCBE = arm_asm_table;
+  Data64bitsDirective = 0;
+  CommentString = "@";
+  COMMDirectiveTakesAlignment = false;
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::SjLj;
+  AbsoluteEHSectionOffsets = false;
+}
+
+ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
+  AlignmentIsInBytes = false;
+  Data64bitsDirective = 0;
+  CommentString = "@";
+  COMMDirectiveTakesAlignment = false;
+
+  NeedsSet = false;
+  HasLEB128 = true;
+  AbsoluteDebugSectionOffsets = true;
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  SetDirective = "\t.set\t";
+  LCOMMDirective = "\t.lcomm\t";
+
+  DwarfRequiresFrameSection = false;
+
+  SupportsDebugInformation = true;
+}
diff --git a/lib/Target/ARM/ARMMCAsmInfo.h b/lib/Target/ARM/ARMMCAsmInfo.h
new file mode 100644
index 0000000..90f7822
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- ARMMCAsmInfo.h - ARM asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARMTARGETASMINFO_H
+#define LLVM_ARMTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+  struct ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    explicit ARMMCAsmInfoDarwin();
+  };
+
+  struct ARMELFMCAsmInfo : public MCAsmInfo {
+    explicit ARMELFMCAsmInfo();
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 66d3df6..2176b27 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -1,10 +1,10 @@
 //====- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
-//
+//
 // The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
-//
+//
 //===----------------------------------------------------------------------===//
 //
 // This file declares ARM-specific per-machine-function information.
@@ -52,10 +52,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// enable far jump.
   bool LRSpilledForFarJump;
 
-  /// R3IsLiveIn - True if R3 is live in to this function.
-  /// FIXME: Remove when register scavenger for Thumb is done.
-  bool R3IsLiveIn;
-
   /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
   /// spill stack offset.
   unsigned FramePtrSpillOffset;
@@ -100,7 +96,7 @@ public:
     hasThumb2(false),
     Align(2U),
     VarArgsRegSaveSize(0), HasStackFrame(false),
-    LRSpilledForFarJump(false), R3IsLiveIn(false),
+    LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
     GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
@@ -111,7 +107,7 @@ public:
     hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
     Align(isThumb ? 1U : 2U),
     VarArgsRegSaveSize(0), HasStackFrame(false),
-    LRSpilledForFarJump(false), R3IsLiveIn(false),
+    LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
     GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
@@ -119,6 +115,7 @@ public:
     JumpTableUId(0), ConstPoolEntryUId(0) {}
 
   bool isThumbFunction() const { return isThumb; }
+  bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
   bool isThumb2Function() const { return isThumb && hasThumb2; }
 
   unsigned getAlign() const { return Align; }
@@ -133,13 +130,9 @@ public:
   bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
   void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
 
-  // FIXME: Remove when register scavenger for Thumb is done.
-  bool isR3LiveIn() const { return R3IsLiveIn; }
-  void setR3IsLiveIn(bool l) { R3IsLiveIn = l; }
-
   unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
   void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
-
+
   unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
   unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
   unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 0000000..5ff7c38
--- /dev/null
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using neon instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
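The 6561 entries below are the 9^4 possible four-lane shuffle masks: each lane selects one of the eight source elements (0-7) or undef (written u, encoded as 8), and the four lane values are combined as base-9 digits in the same <a,b,c,d> order the per-entry comments enumerate. A minimal sketch of the lookup, with a helper name of our own choosing (each 32-bit entry packs a cost plus an expansion recipe):

// Sketch only: map a 4-lane shuffle mask (lane values 0-7, or 8 for undef)
// to its packed PerfectShuffleTable entry.
static unsigned lookupPerfectShuffle(const unsigned MaskElts[4]) {
  unsigned Index = ((MaskElts[0] * 9 + MaskElts[1]) * 9 + MaskElts[2]) * 9 +
                   MaskElts[3];
  return PerfectShuffleTable[Index]; // packed cost + operand/opcode recipe
}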
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>:
Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 
2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: 
Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // 
<0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, 
<0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, 
<0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, 
RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 
2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 
vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // 
<1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, 
<7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, 
<1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 
2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // 
<1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 
<6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> + 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 
<0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 
2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 
<0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 
<0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // 
<2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, 
<1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, 
<u,6,3,7> + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // 
<2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, 
// <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 
2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, 
// <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // 
<2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 
3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 
vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 
vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // 
<3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 
LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: 
Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: 
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
<2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, 
<6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // 
<4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 
2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 
3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 
2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: 
Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: 
Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, 
<0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // 
<5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // 
<5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 
<1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 
1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 
vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 
<1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: 
Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 
2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> +
2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> +
3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> +
2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> + 
3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS +
3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> +
3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS +
2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> +
2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> +
2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS +
1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> +
2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> +
2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> +
3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> +
2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 
2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 
2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> +
2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> +
2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 
2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 
vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 
1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: 
Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 
2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: 
Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, 
LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // 
<u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, 
RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, 
<2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 
2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // 
<u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 
1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 
<3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // 
<u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp index f809f37..d5bc3f6 100644 --- a/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/lib/Target/ARM/ARMRegisterInfo.cpp @@ -13,6 +13,7 @@ #include "ARM.h" #include 
"ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" #include "ARMInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" @@ -26,6 +27,7 @@ #include "llvm/CodeGen/MachineLocation.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -33,1370 +35,7 @@ #include "llvm/ADT/SmallVector.h" using namespace llvm; -unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum) { - using namespace ARM; - switch (RegEnum) { - case R0: case S0: case D0: return 0; - case R1: case S1: case D1: return 1; - case R2: case S2: case D2: return 2; - case R3: case S3: case D3: return 3; - case R4: case S4: case D4: return 4; - case R5: case S5: case D5: return 5; - case R6: case S6: case D6: return 6; - case R7: case S7: case D7: return 7; - case R8: case S8: case D8: return 8; - case R9: case S9: case D9: return 9; - case R10: case S10: case D10: return 10; - case R11: case S11: case D11: return 11; - case R12: case S12: case D12: return 12; - case SP: case S13: case D13: return 13; - case LR: case S14: case D14: return 14; - case PC: case S15: case D15: return 15; - case S16: return 16; - case S17: return 17; - case S18: return 18; - case S19: return 19; - case S20: return 20; - case S21: return 21; - case S22: return 22; - case S23: return 23; - case S24: return 24; - case S25: return 25; - case S26: return 26; - case S27: return 27; - case S28: return 28; - case S29: return 29; - case S30: return 30; - case S31: return 31; - default: - assert(0 && "Unknown ARM register!"); - abort(); - } -} - -unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum, - bool &isSPVFP) { - isSPVFP = false; - - using namespace ARM; - switch (RegEnum) { - default: - assert(0 && "Unknown ARM register!"); - abort(); - case R0: case D0: return 0; - case R1: case D1: return 1; - case R2: case D2: return 2; - case R3: case D3: return 3; - case R4: case D4: return 4; - case R5: case D5: return 5; - case R6: case D6: return 6; - case R7: case D7: return 7; - case R8: case D8: return 8; - case R9: case D9: return 9; - case R10: case D10: return 10; - case R11: case D11: return 11; - case R12: case D12: return 12; - case SP: case D13: return 13; - case LR: case D14: return 14; - case PC: case D15: return 15; - - case S0: case S1: case S2: case S3: - case S4: case S5: case S6: case S7: - case S8: case S9: case S10: case S11: - case S12: case S13: case S14: case S15: - case S16: case S17: case S18: case S19: - case S20: case S21: case S22: case S23: - case S24: case S25: case S26: case S27: - case S28: case S29: case S30: case S31: { - isSPVFP = true; - switch (RegEnum) { - default: return 0; // Avoid compile time warning. 
- case S0: return 0; - case S1: return 1; - case S2: return 2; - case S3: return 3; - case S4: return 4; - case S5: return 5; - case S6: return 6; - case S7: return 7; - case S8: return 8; - case S9: return 9; - case S10: return 10; - case S11: return 11; - case S12: return 12; - case S13: return 13; - case S14: return 14; - case S15: return 15; - case S16: return 16; - case S17: return 17; - case S18: return 18; - case S19: return 19; - case S20: return 20; - case S21: return 21; - case S22: return 22; - case S23: return 23; - case S24: return 24; - case S25: return 25; - case S26: return 26; - case S27: return 27; - case S28: return 28; - case S29: return 29; - case S30: return 30; - case S31: return 31; - } - } - } -} - -ARMBaseRegisterInfo::ARMBaseRegisterInfo(const TargetInstrInfo &tii, - const ARMSubtarget &sti) - : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), - TII(tii), STI(sti), - FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) { -} - -ARMRegisterInfo::ARMRegisterInfo(const TargetInstrInfo &tii, +ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &sti) : ARMBaseRegisterInfo(tii, sti) { } - -static inline -const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) { - return MIB.addImm((int64_t)ARMCC::AL).addReg(0); -} - -static inline -const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) { - return MIB.addReg(0); -} - -/// emitLoadConstPool - Emits a load from constpool to materialize the -/// specified immediate. -void ARMRegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo *TII, DebugLoc dl, - unsigned DestReg, int Val, - ARMCC::CondCodes Pred, - unsigned PredReg) const { - MachineFunction &MF = *MBB.getParent(); - MachineConstantPool *ConstantPool = MF.getConstantPool(); - Constant *C = ConstantInt::get(Type::Int32Ty, Val); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); - - BuildMI(MBB, MBBI, dl, TII->get(ARM::LDRcp), DestReg) - .addConstantPoolIndex(Idx) - .addReg(0).addImm(0).addImm(Pred).addReg(PredReg); -} - -const unsigned* -ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - static const unsigned CalleeSavedRegs[] = { - ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8, - ARM::R7, ARM::R6, ARM::R5, ARM::R4, - - ARM::D15, ARM::D14, ARM::D13, ARM::D12, - ARM::D11, ARM::D10, ARM::D9, ARM::D8, - 0 - }; - - static const unsigned DarwinCalleeSavedRegs[] = { - // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved - // register. - ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4, - ARM::R11, ARM::R10, ARM::R8, - - ARM::D15, ARM::D14, ARM::D13, ARM::D12, - ARM::D11, ARM::D10, ARM::D9, ARM::D8, - 0 - }; - return STI.isTargetDarwin() ? 
DarwinCalleeSavedRegs : CalleeSavedRegs; -} - -const TargetRegisterClass* const * -ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={ - &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - if (STI.isThumb()) { - return STI.isTargetDarwin() - ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses; - } - return STI.isTargetDarwin() - ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses; -} - -BitVector ARMBaseRegisterInfo::getReservedRegs(const MachineFunction &MF) const { - // FIXME: avoid re-calculating this everytime. - BitVector Reserved(getNumRegs()); - Reserved.set(ARM::SP); - Reserved.set(ARM::PC); - if (STI.isTargetDarwin() || hasFP(MF)) - Reserved.set(FramePtr); - // Some targets reserve R9. - if (STI.isR9Reserved()) - Reserved.set(ARM::R9); - return Reserved; -} - -bool -ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const { - switch (Reg) { - default: break; - case ARM::SP: - case ARM::PC: - return true; - case ARM::R7: - case ARM::R11: - if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF))) - return true; - break; - case ARM::R9: - return STI.isR9Reserved(); - } - - return false; -} - -const TargetRegisterClass *ARMBaseRegisterInfo::getPointerRegClass() const { - return &ARM::GPRRegClass; -} - -/// getAllocationOrder - Returns the register allocation order for a specified -/// register class in the form of a pair of TargetRegisterClass iterators. -std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> -ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, - unsigned HintType, unsigned HintReg, - const MachineFunction &MF) const { - // Alternative register allocation orders when favoring even / odd registers - // of register pairs. - - // No FP, R9 is available. 
- static const unsigned GPREven1[] = { - ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10, - ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, - ARM::R9, ARM::R11 - }; - static const unsigned GPROdd1[] = { - ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11, - ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, - ARM::R8, ARM::R10 - }; - - // FP is R7, R9 is available. - static const unsigned GPREven2[] = { - ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10, - ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, - ARM::R9, ARM::R11 - }; - static const unsigned GPROdd2[] = { - ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11, - ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, - ARM::R8, ARM::R10 - }; - - // FP is R11, R9 is available. - static const unsigned GPREven3[] = { - ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, - ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, - ARM::R9 - }; - static const unsigned GPROdd3[] = { - ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9, - ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7, - ARM::R8 - }; - - // No FP, R9 is not available. - static const unsigned GPREven4[] = { - ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10, - ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8, - ARM::R11 - }; - static const unsigned GPROdd4[] = { - ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11, - ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, - ARM::R10 - }; - - // FP is R7, R9 is not available. - static const unsigned GPREven5[] = { - ARM::R0, ARM::R2, ARM::R4, ARM::R10, - ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8, - ARM::R11 - }; - static const unsigned GPROdd5[] = { - ARM::R1, ARM::R3, ARM::R5, ARM::R11, - ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, - ARM::R10 - }; - - // FP is R11, R9 is not available. - static const unsigned GPREven6[] = { - ARM::R0, ARM::R2, ARM::R4, ARM::R6, - ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8 - }; - static const unsigned GPROdd6[] = { - ARM::R1, ARM::R3, ARM::R5, ARM::R7, - ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8 - }; - - - if (HintType == ARMRI::RegPairEven) { - if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) - // It's no longer possible to fulfill this hint. Return the default - // allocation order. - return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); - - if (!STI.isTargetDarwin() && !hasFP(MF)) { - if (!STI.isR9Reserved()) - return std::make_pair(GPREven1, - GPREven1 + (sizeof(GPREven1)/sizeof(unsigned))); - else - return std::make_pair(GPREven4, - GPREven4 + (sizeof(GPREven4)/sizeof(unsigned))); - } else if (FramePtr == ARM::R7) { - if (!STI.isR9Reserved()) - return std::make_pair(GPREven2, - GPREven2 + (sizeof(GPREven2)/sizeof(unsigned))); - else - return std::make_pair(GPREven5, - GPREven5 + (sizeof(GPREven5)/sizeof(unsigned))); - } else { // FramePtr == ARM::R11 - if (!STI.isR9Reserved()) - return std::make_pair(GPREven3, - GPREven3 + (sizeof(GPREven3)/sizeof(unsigned))); - else - return std::make_pair(GPREven6, - GPREven6 + (sizeof(GPREven6)/sizeof(unsigned))); - } - } else if (HintType == ARMRI::RegPairOdd) { - if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0) - // It's no longer possible to fulfill this hint. Return the default - // allocation order. 
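These alternative allocation orders exist to honor even/odd pair hints: ARM's LDRD/STRD need an even/odd register pair (r0/r1, r4/r5, ...). A condensed sketch of the partner lookup the hint machinery performs — the real getRegisterPairOdd() later in this file also checks reserved registers and Thumb restrictions:

```cpp
// Condensed sketch: given the even half of a pair, return the odd partner,
// or 0 when no legal pair exists (special registers such as R12/SP/LR/PC
// never pair).
static unsigned getOddPartnerSketch(unsigned Reg) {
  switch (Reg) {
  case ARM::R0: return ARM::R1;
  case ARM::R4: return ARM::R5;
  case ARM::R8: return ARM::R9; // unless R9 is reserved on this target
  default:      return 0;
  }
}
```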
- return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); - - if (!STI.isTargetDarwin() && !hasFP(MF)) { - if (!STI.isR9Reserved()) - return std::make_pair(GPROdd1, - GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned))); - else - return std::make_pair(GPROdd4, - GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned))); - } else if (FramePtr == ARM::R7) { - if (!STI.isR9Reserved()) - return std::make_pair(GPROdd2, - GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned))); - else - return std::make_pair(GPROdd5, - GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned))); - } else { // FramePtr == ARM::R11 - if (!STI.isR9Reserved()) - return std::make_pair(GPROdd3, - GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned))); - else - return std::make_pair(GPROdd6, - GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned))); - } - } - return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); -} - -/// ResolveRegAllocHint - Resolves the specified register allocation hint -/// to a physical register. Returns the physical register if it is successful. -unsigned -ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg, - const MachineFunction &MF) const { - if (Reg == 0 || !isPhysicalRegister(Reg)) - return 0; - if (Type == 0) - return Reg; - else if (Type == (unsigned)ARMRI::RegPairOdd) - // Odd register. - return getRegisterPairOdd(Reg, MF); - else if (Type == (unsigned)ARMRI::RegPairEven) - // Even register. - return getRegisterPairEven(Reg, MF); - return 0; -} - -void -ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, - MachineFunction &MF) const { - MachineRegisterInfo *MRI = &MF.getRegInfo(); - std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); - if ((Hint.first == (unsigned)ARMRI::RegPairOdd || - Hint.first == (unsigned)ARMRI::RegPairEven) && - Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) { - // If 'Reg' is one of the even / odd register pair and it's now changed - // (e.g. coalesced) into a different register. The other register of the - // pair allocation hint must be updated to reflect the relationship - // change. - unsigned OtherReg = Hint.second; - Hint = MRI->getRegAllocationHint(OtherReg); - if (Hint.second == Reg) - // Make sure the pair has not already divorced. - MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); - } -} - -bool -ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return true; -} - -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -/// -bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (NoFramePointerElim || - MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - -// hasReservedCallFrame - Under normal circumstances, when a frame pointer is -// not required, we reserve argument space for call sites in the function -// immediately on entry to the current function. This eliminates the need for -// add/sub sp brackets around call sites. Returns true if the call frame is -// included as part of the stack frame. -bool ARMRegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - unsigned CFSize = FFI->getMaxCallFrameSize(); - // It's not always a good idea to include the call frame as part of the - // stack frame. 
ARM (especially Thumb) has small immediate offset to - // address the stack frame. So a large call frame can cause poor codegen - // and may even makes it impossible to scavenge a register. - if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12 - return false; - - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -/// emitARMRegPlusImmediate - Emits a series of instructions to materialize -/// a destreg = basereg + immediate in ARM code. -static -void emitARMRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned DestReg, unsigned BaseReg, int NumBytes, - ARMCC::CondCodes Pred, unsigned PredReg, - const TargetInstrInfo &TII, - DebugLoc dl) { - bool isSub = NumBytes < 0; - if (isSub) NumBytes = -NumBytes; - - while (NumBytes) { - unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes); - unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt); - assert(ThisVal && "Didn't extract field correctly"); - - // We will handle these bits from offset, clear them. - NumBytes &= ~ThisVal; - - // Get the properly encoded SOImmVal field. - int SOImmVal = ARM_AM::getSOImmVal(ThisVal); - assert(SOImmVal != -1 && "Bit extraction didn't work?"); - - // Build the new ADD / SUB. - BuildMI(MBB, MBBI, dl, TII.get(isSub ? ARM::SUBri : ARM::ADDri), DestReg) - .addReg(BaseReg, RegState::Kill).addImm(SOImmVal) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); - BaseReg = DestReg; - } -} - -static void -emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo &TII, DebugLoc dl, - int NumBytes, - ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { - emitARMRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII, dl); -} - -void ARMRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - if (!hasReservedCallFrame(MF)) { - // If we have alloca, convert as follows: - // ADJCALLSTACKDOWN -> sub, sp, sp, amount - // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr *Old = I; - DebugLoc dl = Old->getDebugLoc(); - unsigned Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; - - // Replace the pseudo instruction with a new instruction... - unsigned Opc = Old->getOpcode(); - ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm(); - if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { - // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. - unsigned PredReg = Old->getOperand(2).getReg(); - emitSPUpdate(MBB, I, TII, dl, -Amount, Pred, PredReg); - } else { - // Note: PredReg is operand 3 for ADJCALLSTACKUP. - unsigned PredReg = Old->getOperand(3).getReg(); - assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); - emitSPUpdate(MBB, I, TII, dl, Amount, Pred, PredReg); - } - } - } - MBB.erase(I); -} - -/// findScratchRegister - Find a 'free' ARM register. If register scavenger -/// is not being used, R12 is available. Otherwise, try for a call-clobbered -/// register first and then a spilled callee-saved register if that fails. -static -unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC, - ARMFunctionInfo *AFI) { - unsigned Reg = RS ? 
RS->FindUnusedReg(RC, true) : (unsigned) ARM::R12; - assert (!AFI->isThumbFunction()); - if (Reg == 0) - // Try a already spilled CS register. - Reg = RS->FindUnusedReg(RC, AFI->getSpilledCSRegisters()); - - return Reg; -} - -void ARMRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const{ - unsigned i = 0; - MachineInstr &MI = *II; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc dl = MI.getDebugLoc(); - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - unsigned FrameReg = ARM::SP; - int FrameIndex = MI.getOperand(i).getIndex(); - int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MF.getFrameInfo()->getStackSize() + SPAdj; - - if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex)) - Offset -= AFI->getGPRCalleeSavedArea1Offset(); - else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) - Offset -= AFI->getGPRCalleeSavedArea2Offset(); - else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex)) - Offset -= AFI->getDPRCalleeSavedAreaOffset(); - else if (hasFP(MF)) { - assert(SPAdj == 0 && "Unexpected"); - // There is alloca()'s in this function, must reference off the frame - // pointer instead. - FrameReg = getFrameRegister(MF); - Offset -= AFI->getFramePtrSpillOffset(); - } - - unsigned Opcode = MI.getOpcode(); - const TargetInstrDesc &Desc = MI.getDesc(); - unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); - bool isSub = false; - - // Memory operands in inline assembly always use AddrMode2. - if (Opcode == ARM::INLINEASM) - AddrMode = ARMII::AddrMode2; - - if (Opcode == ARM::ADDri) { - Offset += MI.getOperand(i+1).getImm(); - if (Offset == 0) { - // Turn it into a move. - MI.setDesc(TII.get(ARM::MOVr)); - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.RemoveOperand(i+1); - return; - } else if (Offset < 0) { - Offset = -Offset; - isSub = true; - MI.setDesc(TII.get(ARM::SUBri)); - } - - // Common case: small offset, fits into instruction. - int ImmedOffset = ARM_AM::getSOImmVal(Offset); - if (ImmedOffset != -1) { - // Replace the FrameIndex with sp / fp - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(ImmedOffset); - return; - } - - // Otherwise, we fallback to common code below to form the imm offset with - // a sequence of ADDri instructions. First though, pull as much of the imm - // into this ADDri as possible. - unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset); - unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt); - - // We will handle these bits from offset, clear them. - Offset &= ~ThisImmVal; - - // Get the properly encoded SOImmVal field. 
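A standalone sketch of the immediate decomposition used by emitARMRegPlusImmediate() and the ADDri folding above. An ARM "SOImm" is an 8-bit value rotated right by an even amount, so an arbitrary constant is peeled off in encodable 8-bit chunks, one ADDri/SUBri per chunk. This simplification ignores chunks that wrap around bit 31 (which the real ARM_AM helpers handle) and uses the GCC/Clang __builtin_ctz intrinsic:

```cpp
#include <cassert>
#include <cstdint>

// Peel the next SOImm-encodable chunk off Remaining and clear it.
static uint32_t takeSOImmChunk(uint32_t &Remaining) {
  assert(Remaining && "nothing left to materialize");
  unsigned Shift = __builtin_ctz(Remaining) & ~1u; // even rotations only
  uint32_t Chunk = Remaining & (0xFFu << Shift);   // 8 bits at even offset
  Remaining &= ~Chunk;
  return Chunk;
}
// Materializing 0x12340 yields chunks 0x2340 and 0x10000: two instructions.
```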
- int ThisSOImmVal = ARM_AM::getSOImmVal(ThisImmVal); - assert(ThisSOImmVal != -1 && "Bit extraction didn't work?"); - MI.getOperand(i+1).ChangeToImmediate(ThisSOImmVal); - } else { - unsigned ImmIdx = 0; - int InstrOffs = 0; - unsigned NumBits = 0; - unsigned Scale = 1; - switch (AddrMode) { - case ARMII::AddrMode2: { - ImmIdx = i+2; - InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm()); - if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) - InstrOffs *= -1; - NumBits = 12; - break; - } - case ARMII::AddrMode3: { - ImmIdx = i+2; - InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm()); - if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) - InstrOffs *= -1; - NumBits = 8; - break; - } - case ARMII::AddrMode5: { - ImmIdx = i+1; - InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm()); - if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) - InstrOffs *= -1; - NumBits = 8; - Scale = 4; - break; - } - default: - assert(0 && "Unsupported addressing mode!"); - abort(); - break; - } - - Offset += InstrOffs * Scale; - assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); - if (Offset < 0) { - Offset = -Offset; - isSub = true; - } - - // Common case: small offset, fits into instruction. - MachineOperand &ImmOp = MI.getOperand(ImmIdx); - int ImmedOffset = Offset / Scale; - unsigned Mask = (1 << NumBits) - 1; - if ((unsigned)Offset <= Mask * Scale) { - // Replace the FrameIndex with sp - MI.getOperand(i).ChangeToRegister(FrameReg, false); - if (isSub) - ImmedOffset |= 1 << NumBits; - ImmOp.ChangeToImmediate(ImmedOffset); - return; - } - - // Otherwise, it didn't fit. Pull in what we can to simplify the immed. - ImmedOffset = ImmedOffset & Mask; - if (isSub) - ImmedOffset |= 1 << NumBits; - ImmOp.ChangeToImmediate(ImmedOffset); - Offset &= ~(Mask*Scale); - } - - // If we get here, the immediate doesn't fit into the instruction. We folded - // as much as possible above, handle the rest, providing a register that is - // SP+LargeImm. - assert(Offset && "This code isn't needed if offset already handled!"); - - // Insert a set of r12 with the full address: r12 = sp + offset - // If the offset we have is too large to fit into the instruction, we need - // to form it with a series of ADDri's. Do this by taking 8-bit chunks - // out of 'Offset'. - unsigned ScratchReg = findScratchRegister(RS, &ARM::GPRRegClass, AFI); - if (ScratchReg == 0) - // No register is "free". Scavenge a register. - ScratchReg = RS->scavengeRegister(&ARM::GPRRegClass, II, SPAdj); - int PIdx = MI.findFirstPredOperandIdx(); - ARMCC::CondCodes Pred = (PIdx == -1) - ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); - unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg(); - emitARMRegPlusImmediate(MBB, II, ScratchReg, FrameReg, - isSub ? 
-Offset : Offset, Pred, PredReg, TII, dl); - MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); -} - -static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset+Align-1)/Align*Align; - } - return (unsigned)Offset; -} - -void -ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { - // This tells PEI to spill the FP as if it is any other callee-save register - // to take advantage the eliminateFrameIndex machinery. This also ensures it - // is spilled in the order specified by getCalleeSavedRegs() to make it easier - // to combine multiple loads / stores. - bool CanEliminateFrame = true; - bool CS1Spilled = false; - bool LRSpilled = false; - unsigned NumGPRSpills = 0; - SmallVector<unsigned, 4> UnspilledCS1GPRs; - SmallVector<unsigned, 4> UnspilledCS2GPRs; - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - - // Don't spill FP if the frame can be eliminated. This is determined - // by scanning the callee-save registers to see if any is used. - const unsigned *CSRegs = getCalleeSavedRegs(); - const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses(); - for (unsigned i = 0; CSRegs[i]; ++i) { - unsigned Reg = CSRegs[i]; - bool Spilled = false; - if (MF.getRegInfo().isPhysRegUsed(Reg)) { - AFI->setCSRegisterIsSpilled(Reg); - Spilled = true; - CanEliminateFrame = false; - } else { - // Check alias registers too. - for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) { - if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { - Spilled = true; - CanEliminateFrame = false; - } - } - } - - if (CSRegClasses[i] == &ARM::GPRRegClass) { - if (Spilled) { - NumGPRSpills++; - - if (!STI.isTargetDarwin()) { - if (Reg == ARM::LR) - LRSpilled = true; - CS1Spilled = true; - continue; - } - - // Keep track if LR and any of R4, R5, R6, and R7 is spilled. - switch (Reg) { - case ARM::LR: - LRSpilled = true; - // Fallthrough - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - CS1Spilled = true; - break; - default: - break; - } - } else { - if (!STI.isTargetDarwin()) { - UnspilledCS1GPRs.push_back(Reg); - continue; - } - - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - UnspilledCS1GPRs.push_back(Reg); - break; - default: - UnspilledCS2GPRs.push_back(Reg); - break; - } - } - } - } - - bool ForceLRSpill = false; - if (!LRSpilled && AFI->isThumbFunction()) { - unsigned FnSize = TII.GetFunctionSizeInBytes(MF); - // Force LR to be spilled if the Thumb function size is > 2048. This enables - // use of BL to implement far jump. If it turns out that it's not needed - // then the branch fix up path will undo it. - if (FnSize >= (1 << 11)) { - CanEliminateFrame = false; - ForceLRSpill = true; - } - } - - bool ExtraCSSpill = false; - if (!CanEliminateFrame || hasFP(MF)) { - AFI->setHasStackFrame(true); - - // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. - // Spill LR as well so we can fold BX_RET to the registers restore (LDM). 
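The rounding idiom used by eliminateCallFramePseudoInstr() and by the estimateStackSize() loop above, isolated:

```cpp
// Round Offset up to the next multiple of Align.
static unsigned alignTo(unsigned Offset, unsigned Align) {
  return (Offset + Align - 1) / Align * Align;
}
// alignTo(13, 8) == 16; alignTo(16, 8) == 16.
```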
- if (!LRSpilled && CS1Spilled) { - MF.getRegInfo().setPhysRegUsed(ARM::LR); - AFI->setCSRegisterIsSpilled(ARM::LR); - NumGPRSpills++; - UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), - UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); - ForceLRSpill = false; - ExtraCSSpill = true; - } - - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - if (STI.isTargetDarwin() || hasFP(MF)) { - MF.getRegInfo().setPhysRegUsed(FramePtr); - NumGPRSpills++; - } - - // If stack and double are 8-byte aligned and we are spilling an odd number - // of GPRs. Spill one extra callee save GPR so we won't have to pad between - // the integer and double callee save areas. - unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); - if (TargetAlign == 8 && (NumGPRSpills & 1)) { - if (CS1Spilled && !UnspilledCS1GPRs.empty()) { - for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { - unsigned Reg = UnspilledCS1GPRs[i]; - // Don't spiil high register if the function is thumb - if (!AFI->isThumbFunction() || - isARMLowRegister(Reg) || Reg == ARM::LR) { - MF.getRegInfo().setPhysRegUsed(Reg); - AFI->setCSRegisterIsSpilled(Reg); - if (!isReservedReg(MF, Reg)) - ExtraCSSpill = true; - break; - } - } - } else if (!UnspilledCS2GPRs.empty() && - !AFI->isThumbFunction()) { - unsigned Reg = UnspilledCS2GPRs.front(); - MF.getRegInfo().setPhysRegUsed(Reg); - AFI->setCSRegisterIsSpilled(Reg); - if (!isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } - } - - // Estimate if we might need to scavenge a register at some point in order - // to materialize a stack offset. If so, either spill one additional - // callee-saved register or reserve a special spill slot to facilitate - // register scavenging. - if (RS && !ExtraCSSpill && !AFI->isThumbFunction()) { - MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned Size = estimateStackSize(MF, MFI); - unsigned Limit = (1 << 12) - 1; - for (MachineFunction::iterator BB = MF.begin(),E = MF.end();BB != E; ++BB) - for (MachineBasicBlock::iterator I= BB->begin(); I != BB->end(); ++I) { - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (I->getOperand(i).isFI()) { - unsigned Opcode = I->getOpcode(); - const TargetInstrDesc &Desc = TII.get(Opcode); - unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); - if (AddrMode == ARMII::AddrMode3) { - Limit = (1 << 8) - 1; - goto DoneEstimating; - } else if (AddrMode == ARMII::AddrMode5) { - unsigned ThisLimit = ((1 << 8) - 1) * 4; - if (ThisLimit < Limit) - Limit = ThisLimit; - } - } - } - DoneEstimating: - if (Size >= Limit) { - // If any non-reserved CS register isn't spilled, just spill one or two - // extra. That should take care of it! - unsigned NumExtras = TargetAlign / 4; - SmallVector<unsigned, 2> Extras; - while (NumExtras && !UnspilledCS1GPRs.empty()) { - unsigned Reg = UnspilledCS1GPRs.back(); - UnspilledCS1GPRs.pop_back(); - if (!isReservedReg(MF, Reg)) { - Extras.push_back(Reg); - NumExtras--; - } - } - while (NumExtras && !UnspilledCS2GPRs.empty()) { - unsigned Reg = UnspilledCS2GPRs.back(); - UnspilledCS2GPRs.pop_back(); - if (!isReservedReg(MF, Reg)) { - Extras.push_back(Reg); - NumExtras--; - } - } - if (Extras.size() && NumExtras == 0) { - for (unsigned i = 0, e = Extras.size(); i != e; ++i) { - MF.getRegInfo().setPhysRegUsed(Extras[i]); - AFI->setCSRegisterIsSpilled(Extras[i]); - } - } else { - // Reserve a slot closest to SP or frame pointer. 
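Condensed from the estimating loop above, the farthest frame offset each addressing mode can encode. If the projected frame size exceeds the tightest limit among the modes actually used, and no spare callee-saved register was spilled, an emergency spill slot is reserved for the register scavenger. A sketch (the real loop tightens a single Limit variable rather than calling a helper):

```cpp
static unsigned offsetLimitFor(unsigned AddrMode) {
  switch (AddrMode) {
  case ARMII::AddrMode3: return (1 << 8) - 1;       // 255 (LDRH and friends)
  case ARMII::AddrMode5: return ((1 << 8) - 1) * 4; // 1020 (FLDD/FSTD)
  default:               return (1 << 12) - 1;      // 4095 (AddrMode2 LDR/STR)
  }
}
```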
- const TargetRegisterClass *RC = &ARM::GPRRegClass; - RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), - RC->getAlignment())); - } - } - } - } - - if (ForceLRSpill) { - MF.getRegInfo().setPhysRegUsed(ARM::LR); - AFI->setCSRegisterIsSpilled(ARM::LR); - AFI->setLRIsSpilledForFarJump(true); - } -} - -/// Move iterator pass the next bunch of callee save load / store ops for -/// the particular spill area (1: integer area 1, 2: integer area 2, -/// 3: fp area, 0: don't care). -static void movePastCSLoadStoreOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - int Opc, unsigned Area, - const ARMSubtarget &STI) { - while (MBBI != MBB.end() && - MBBI->getOpcode() == Opc && MBBI->getOperand(1).isFI()) { - if (Area != 0) { - bool Done = false; - unsigned Category = 0; - switch (MBBI->getOperand(0).getReg()) { - case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: - case ARM::LR: - Category = 1; - break; - case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - Category = STI.isTargetDarwin() ? 2 : 1; - break; - case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11: - case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15: - Category = 3; - break; - default: - Done = true; - break; - } - if (Done || Category != Area) - break; - } - - ++MBBI; - } -} - -void ARMRegisterInfo::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - unsigned NumBytes = MFI->getStackSize(); - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = (MBBI != MBB.end() ? - MBBI->getDebugLoc() : DebugLoc::getUnknownLoc()); - - // Determine the sizes of each callee-save spill areas and record which frame - // belongs to which callee-save spill areas. - unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; - int FramePtrSpillFI = 0; - - if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, -VARegSaveSize); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, -NumBytes); - return; - } - - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - break; - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - if (STI.isTargetDarwin()) { - AFI->addGPRCalleeSavedArea2Frame(FI); - GPRCS2Size += 4; - } else { - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - } - break; - default: - AFI->addDPRCalleeSavedAreaFrame(FI); - DPRCSSize += 8; - } - } - - // Build the new SUBri to adjust SP for integer callee-save spill area 1. - emitSPUpdate(MBB, MBBI, TII, dl, -GPRCS1Size); - movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 1, STI); - - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - if (STI.isTargetDarwin() || hasFP(MF)) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ARM::ADDri), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); - AddDefaultCC(AddDefaultPred(MIB)); - } - - // Build the new SUBri to adjust SP for integer callee-save spill area 2. 
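A worked instance of the spill-area bookkeeping this prologue performs (the offset computations appear in the hunk just below), using made-up sizes:

```cpp
// Made-up numbers: 20 bytes of area-1 GPR spills, 8 of area-2 (Darwin),
// 16 of DPR spills, 64 bytes of locals, so getStackSize() == 108.
unsigned NumBytes     = 108;
unsigned DPRCSOffset  = NumBytes - (20 + 8 + 16); // 64
unsigned GPRCS2Offset = DPRCSOffset + 16;         // 80
unsigned GPRCS1Offset = GPRCS2Offset + 8;         // 88
// After the three SUBri adjustments for the spill areas, SP still drops by
// DPRCSOffset (64) to make room for the locals themselves.
```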
- emitSPUpdate(MBB, MBBI, TII, dl, -GPRCS2Size); - - // Build the new SUBri to adjust SP for FP callee-save spill area. - movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 2, STI); - emitSPUpdate(MBB, MBBI, TII, dl, -DPRCSSize); - - // Determine starting offsets of spill areas. - unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); - unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; - unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); - AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); - AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); - AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); - - NumBytes = DPRCSOffset; - if (NumBytes) { - // Insert it after all the callee-save spills. - movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 3, STI); - emitSPUpdate(MBB, MBBI, TII, dl, -NumBytes); - } - - if (STI.isTargetELF() && hasFP(MF)) { - MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - - AFI->getFramePtrSpillOffset()); - } - - AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); - AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); - AFI->setDPRCalleeSavedAreaSize(DPRCSSize); -} - -static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) { - return ((MI->getOpcode() == ARM::FLDD || - MI->getOpcode() == ARM::LDR) && - MI->getOperand(1).isFI() && - isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)); -} - -void ARMRegisterInfo::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - assert(MBBI->getOpcode() == ARM::BX_RET && - "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - int NumBytes = (int)MFI->getStackSize(); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, NumBytes); - } else { - // Unwind MBBI to point to first LDR / FLDD. - const unsigned *CSRegs = getCalleeSavedRegs(); - if (MBBI != MBB.begin()) { - do - --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); - if (!isCSRestore(MBBI, CSRegs)) - ++MBBI; - } - - // Move SP to start of FP callee save spill area. - NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + - AFI->getGPRCalleeSavedArea2Size() + - AFI->getDPRCalleeSavedAreaSize()); - - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - if ((STI.isTargetDarwin() && NumBytes) || hasFP(MF)) { - NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; - // Reset SP based on frame pointer only if the stack frame extends beyond - // frame pointer stack slot or target is ELF and the function has FP. - if (AFI->getGPRCalleeSavedArea2Size() || - AFI->getDPRCalleeSavedAreaSize() || - AFI->getDPRCalleeSavedAreaOffset()|| - hasFP(MF)) { - if (NumBytes) - BuildMI(MBB, MBBI, dl, TII.get(ARM::SUBri), ARM::SP).addReg(FramePtr) - .addImm(NumBytes) - .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP).addReg(FramePtr) - .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); - } - } else if (NumBytes) { - emitSPUpdate(MBB, MBBI, TII, dl, NumBytes); - } - - // Move SP to start of integer callee save spill area 2. 
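For orientation, a rough shape of the epilogue this code emits when a frame pointer is live. Offsets are illustrative, and the individual restore loads are inserted by the generic prologue/epilogue pass rather than by this function:

```cpp
// Illustrative only (not patch output):
//   sub  sp, r7, #24          ; or "mov sp, r7" when NumBytes is 0
//   fldd d8, [sp, #...]       ; DPR callee-save restores
//   add  sp, sp, #16          ; step over the DPR spill area
//   ldr  r8, [sp, #...]       ; area-2 GPR restores (Darwin layout)
//   add  sp, sp, #8
//   ldr  r4, [sp, #...]       ; area-1 GPR restores
//   add  sp, sp, #20
//   add  sp, sp, #VARegSaveSize  ; only for variadic functions
//   bx   lr
```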
- movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 3, STI); - emitSPUpdate(MBB, MBBI, TII, dl, AFI->getDPRCalleeSavedAreaSize()); - - // Move SP to start of integer callee save spill area 1. - movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 2, STI); - emitSPUpdate(MBB, MBBI, TII, dl, AFI->getGPRCalleeSavedArea2Size()); - - // Move SP to SP upon entry to the function. - movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 1, STI); - emitSPUpdate(MBB, MBBI, TII, dl, AFI->getGPRCalleeSavedArea1Size()); - } - - if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, VARegSaveSize); - -} - -unsigned ARMBaseRegisterInfo::getRARegister() const { - return ARM::LR; -} - -unsigned ARMBaseRegisterInfo::getFrameRegister(MachineFunction &MF) const { - if (STI.isTargetDarwin() || hasFP(MF)) - return FramePtr; - return ARM::SP; -} - -unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const { - assert(0 && "What is the exception register"); - return 0; -} - -unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const { - assert(0 && "What is the exception handler register"); - return 0; -} - -int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { - return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); -} - -unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg, - const MachineFunction &MF) const { - switch (Reg) { - default: break; - // Return 0 if either register of the pair is a special register. - // So no R12, etc. - case ARM::R1: - return ARM::R0; - case ARM::R3: - // FIXME! - return STI.isThumb() ? 0 : ARM::R2; - case ARM::R5: - return ARM::R4; - case ARM::R7: - return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6; - case ARM::R9: - return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8; - case ARM::R11: - return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10; - - case ARM::S1: - return ARM::S0; - case ARM::S3: - return ARM::S2; - case ARM::S5: - return ARM::S4; - case ARM::S7: - return ARM::S6; - case ARM::S9: - return ARM::S8; - case ARM::S11: - return ARM::S10; - case ARM::S13: - return ARM::S12; - case ARM::S15: - return ARM::S14; - case ARM::S17: - return ARM::S16; - case ARM::S19: - return ARM::S18; - case ARM::S21: - return ARM::S20; - case ARM::S23: - return ARM::S22; - case ARM::S25: - return ARM::S24; - case ARM::S27: - return ARM::S26; - case ARM::S29: - return ARM::S28; - case ARM::S31: - return ARM::S30; - - case ARM::D1: - return ARM::D0; - case ARM::D3: - return ARM::D2; - case ARM::D5: - return ARM::D4; - case ARM::D7: - return ARM::D6; - case ARM::D9: - return ARM::D8; - case ARM::D11: - return ARM::D10; - case ARM::D13: - return ARM::D12; - case ARM::D15: - return ARM::D14; - } - - return 0; -} - -unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg, - const MachineFunction &MF) const { - switch (Reg) { - default: break; - // Return 0 if either register of the pair is a special register. - // So no R12, etc. - case ARM::R0: - return ARM::R1; - case ARM::R2: - // FIXME! - return STI.isThumb() ? 0 : ARM::R3; - case ARM::R4: - return ARM::R5; - case ARM::R6: - return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7; - case ARM::R8: - return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9; - case ARM::R10: - return isReservedReg(MF, ARM::R11) ? 
0 : ARM::R11; - - case ARM::S0: - return ARM::S1; - case ARM::S2: - return ARM::S3; - case ARM::S4: - return ARM::S5; - case ARM::S6: - return ARM::S7; - case ARM::S8: - return ARM::S9; - case ARM::S10: - return ARM::S11; - case ARM::S12: - return ARM::S13; - case ARM::S14: - return ARM::S15; - case ARM::S16: - return ARM::S17; - case ARM::S18: - return ARM::S19; - case ARM::S20: - return ARM::S21; - case ARM::S22: - return ARM::S23; - case ARM::S24: - return ARM::S25; - case ARM::S26: - return ARM::S27; - case ARM::S28: - return ARM::S29; - case ARM::S30: - return ARM::S31; - - case ARM::D0: - return ARM::D1; - case ARM::D2: - return ARM::D3; - case ARM::D4: - return ARM::D5; - case ARM::D6: - return ARM::D7; - case ARM::D8: - return ARM::D9; - case ARM::D10: - return ARM::D11; - case ARM::D12: - return ARM::D13; - case ARM::D14: - return ARM::D15; - } - - return 0; -} - -#include "ARMGenRegisterInfo.inc" diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h index 7fe075a..041afd0 100644 --- a/lib/Target/ARM/ARMRegisterInfo.h +++ b/lib/Target/ARM/ARMRegisterInfo.h @@ -16,127 +16,26 @@ #include "ARM.h" #include "llvm/Target/TargetRegisterInfo.h" -#include "ARMGenRegisterInfo.h.inc" +#include "ARMBaseRegisterInfo.h" namespace llvm { class ARMSubtarget; - class TargetInstrInfo; + class ARMBaseInstrInfo; class Type; -/// Register allocation hints. -namespace ARMRI { - enum { - RegPairOdd = 1, - RegPairEven = 2 +namespace ARM { + /// SubregIndex - The index of various subregister classes. Note that + /// these indices must be kept in sync with the class indices in the + /// ARMRegisterInfo.td file. + enum SubregIndex { + SSUBREG_0 = 1, SSUBREG_1 = 2, SSUBREG_2 = 3, SSUBREG_3 = 4, + DSUBREG_0 = 5, DSUBREG_1 = 6 }; } -/// isARMLowRegister - Returns true if the register is low register r0-r7. -/// -static inline bool isARMLowRegister(unsigned Reg) { - using namespace ARM; - switch (Reg) { - case R0: case R1: case R2: case R3: - case R4: case R5: case R6: case R7: - return true; - default: - return false; - } -} - -struct ARMBaseRegisterInfo : public ARMGenRegisterInfo { -protected: - const TargetInstrInfo &TII; - const ARMSubtarget &STI; - - /// FramePtr - ARM physical register used as frame ptr. - unsigned FramePtr; -public: - ARMBaseRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI); - - /// getRegisterNumbering - Given the enum value for some register, e.g. - /// ARM::LR, return the number that it corresponds to (e.g. 14). - static unsigned getRegisterNumbering(unsigned RegEnum); - - /// Same as previous getRegisterNumbering except it returns true in isSPVFP - /// if the register is a single precision VFP register. - static unsigned getRegisterNumbering(unsigned RegEnum, bool &isSPVFP); - - /// Code Generation virtual methods... 
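The new ARM::SubregIndex enum above names the subregister slots wired up by the SubRegSet definitions in ARMRegisterInfo.td. A hypothetical use, assuming a TargetRegisterInfo pointer TRI:

```cpp
// Hypothetical (not in the patch): pull the two S-register halves out of a
// D register via the named subregister indices.
unsigned SLo = TRI->getSubReg(ARM::D1, ARM::SSUBREG_0); // yields ARM::S2
unsigned SHi = TRI->getSubReg(ARM::D1, ARM::SSUBREG_1); // yields ARM::S3
```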
- const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - - BitVector getReservedRegs(const MachineFunction &MF) const; - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; - - const TargetRegisterClass *getPointerRegClass() const; - - std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> - getAllocationOrder(const TargetRegisterClass *RC, - unsigned HintType, unsigned HintReg, - const MachineFunction &MF) const; - - unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg, - const MachineFunction &MF) const; - - void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, - MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; - - // Debug information queries. - unsigned getRARegister() const; - unsigned getFrameRegister(MachineFunction &MF) const; - - // Exception handling queries. - unsigned getEHExceptionRegister() const; - unsigned getEHHandlerRegister() const; - - int getDwarfRegNum(unsigned RegNum, bool isEH) const; - - bool isLowRegister(unsigned Reg) const; - -private: - unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const; - - unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const; -}; - struct ARMRegisterInfo : public ARMBaseRegisterInfo { public: - ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI); - - /// emitLoadConstPool - Emits a load from constpool to materialize the - /// specified immediate. - void emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo *TII, DebugLoc dl, - unsigned DestReg, int Val, - ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0) const; - - /// Code Generation virtual methods... 
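Several of the predicates hoisted into ARMBaseRegisterInfo above are tested in pairs throughout the patch; condensed here purely as a restatement, not new logic:

```cpp
// hasFP() is true when FP elimination is off, the function has
// variable-sized allocas, or the frame address escapes; Darwin keeps a
// frame pointer even when none of those hold, hence the paired test.
bool useFramePointer(bool IsDarwin, bool HasFP) {
  return IsDarwin || HasFP;
}
```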
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; - - bool requiresRegisterScavenging(const MachineFunction &MF) const; - - bool hasReservedCallFrame(MachineFunction &MF) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; - - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index a057e5c..20a7355 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -18,8 +18,8 @@ class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> { let SubRegs = subregs; } -class ARMFReg<bits<5> num, string n> : Register<n> { - field bits<5> Num; +class ARMFReg<bits<6> num, string n> : Register<n> { + field bits<6> Num; let Namespace = "ARM"; } @@ -58,10 +58,11 @@ def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">; def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">; def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">; def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; +def SDummy : ARMFReg<63, "sINVALID">; // Aliases of the F* registers used to hold 64-bit fp values (doubles) def D0 : ARMReg< 0, "d0", [S0, S1]>; -def D1 : ARMReg< 1, "d1", [S2, S3]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>; def D2 : ARMReg< 2, "d2", [S4, S5]>; def D3 : ARMReg< 3, "d3", [S6, S7]>; def D4 : ARMReg< 4, "d4", [S8, S9]>; @@ -78,18 +79,18 @@ def D14 : ARMReg<14, "d14", [S28, S29]>; def D15 : ARMReg<15, "d15", [S30, S31]>; // VFP3 defines 16 additional double registers -def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d16">; -def D18 : ARMFReg<18, "d16">; def D19 : ARMFReg<19, "d16">; -def D20 : ARMFReg<20, "d16">; def D21 : ARMFReg<21, "d16">; -def D22 : ARMFReg<22, "d16">; def D23 : ARMFReg<23, "d16">; -def D24 : ARMFReg<24, "d16">; def D25 : ARMFReg<25, "d16">; -def D26 : ARMFReg<26, "d16">; def D27 : ARMFReg<27, "d16">; -def D28 : ARMFReg<28, "d16">; def D29 : ARMFReg<29, "d16">; -def D30 : ARMFReg<30, "d16">; def D31 : ARMFReg<31, "d16">; +def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">; +def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">; +def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">; +def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">; +def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">; +def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">; +def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">; +def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">; // Advanced SIMD (NEON) defines 16 quad-word aliases def Q0 : ARMReg< 0, "q0", [D0, D1]>; -def Q1 : ARMReg< 1, "q1", [D2, D3]>; +def Q1 : ARMReg< 1, "q1", [D2, D3]>; def Q2 : ARMReg< 2, "q2", [D4, D5]>; def Q3 : ARMReg< 3, "q3", [D6, D7]>; def Q4 : ARMReg< 4, "q4", [D8, D9]>; @@ -106,7 +107,9 @@ def Q14 : ARMReg<14, "q14", [D28, D29]>; def Q15 : ARMReg<15, "q15", [D30, D31]>; // Current Program Status Register. -def CPSR : ARMReg<0, "cpsr">; +def CPSR : ARMReg<0, "cpsr">; + +def FPSCR : ARMReg<1, "fpscr">; // Register classes. 
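One way to see the ARMFReg sizing change above (an observation, not patch code): D31's hardware number, 31, still fits the old 5-bit field; it is the new SDummy placeholder, deliberately numbered 63 so it can never alias a real encoding, that forces the move to 6 bits:

```cpp
static_assert(31 < (1u << 5), "real D registers fit the old 5-bit field");
static_assert(63 >= (1u << 5) && 63 < (1u << 6),
              "SDummy's Num of 63 requires the wider 6-bit field");
```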
// @@ -158,6 +161,13 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, ARM::R4, ARM::R5, ARM::R6, ARM::R8, ARM::R10,ARM::R11, ARM::R7 }; + // FP is R7, R9 is available as callee-saved register. + // This is used by non-Darwin platform in Thumb mode. + static const unsigned ARM_GPR_AO_5[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, + ARM::R8, ARM::R9, ARM::R10,ARM::R11,ARM::R7 }; GPRClass::iterator GPRClass::allocation_order_begin(const MachineFunction &MF) const { @@ -171,6 +181,8 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, } else { if (Subtarget.isR9Reserved()) return ARM_GPR_AO_2; + else if (Subtarget.isThumb()) + return ARM_GPR_AO_5; else return ARM_GPR_AO_1; } @@ -191,6 +203,8 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, } else { if (Subtarget.isR9Reserved()) I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned)); + else if (Subtarget.isThumb()) + I = ARM_GPR_AO_5 + (sizeof(ARM_GPR_AO_5)/sizeof(unsigned)); else I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned)); } @@ -240,32 +254,45 @@ def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31]>; +// Subset of SPR which can be used as a source of NEON scalars for 16-bit +// operations +def SPR_8 : RegisterClass<"ARM", [f32], 32, + [S0, S1, S2, S3, S4, S5, S6, S7, + S8, S9, S10, S11, S12, S13, S14, S15]>; + +// Dummy f32 regclass to represent impossible subreg indices. +def SPR_INVALID : RegisterClass<"ARM", [f32], 32, [SDummy]> { + let CopyCost = -1; +} + // Scalar double precision floating point / generic 64-bit vector register // class. // ARM requires only word alignment for double. It's more performant if it // is double-word alignment though. def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, [D0, D1, D2, D3, D4, D5, D6, D7, - D8, D9, D10, D11, D12, D13, D14, D15]> { - let SubRegClassList = [SPR, SPR]; + D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31]> { + let SubRegClassList = [SPR_INVALID, SPR_INVALID]; let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; }]; let MethodBodies = [{ // VFP2 - static const unsigned ARM_DPR_VFP2[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7, - ARM::D8, ARM::D9, ARM::D10, ARM::D11, + static const unsigned ARM_DPR_VFP2[] = { + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7, + ARM::D8, ARM::D9, ARM::D10, ARM::D11, ARM::D12, ARM::D13, ARM::D14, ARM::D15 }; // VFP3 static const unsigned ARM_DPR_VFP3[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7, - ARM::D8, ARM::D9, ARM::D10, ARM::D11, + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7, + ARM::D8, ARM::D9, ARM::D10, ARM::D11, ARM::D12, ARM::D13, ARM::D14, ARM::D15, - ARM::D16, ARM::D17, ARM::D18, ARM::D15, + ARM::D16, ARM::D17, ARM::D18, ARM::D19, ARM::D20, ARM::D21, ARM::D22, ARM::D23, ARM::D24, ARM::D25, ARM::D26, ARM::D27, ARM::D28, ARM::D29, ARM::D30, ARM::D31 }; @@ -290,11 +317,34 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, }]; } +// Subset of DPR that are accessible with VFP2 (and so that also have +// 32-bit SPR subregs). 
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v2i32, v2f32], 64, + [D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15]> { + let SubRegClassList = [SPR, SPR]; +} + +// Subset of DPR which can be used as a source of NEON scalars for 16-bit +// operations +def DPR_8 : RegisterClass<"ARM", [f64, v4i16, v2f32], 64, + [D0, D1, D2, D3, D4, D5, D6, D7]> { + let SubRegClassList = [SPR_8, SPR_8]; +} + // Generic 128-bit vector register class. def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> { - let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR]; + let SubRegClassList = [SPR_INVALID, SPR_INVALID, SPR_INVALID, SPR_INVALID, + DPR, DPR]; +} + +// Subset of QPR that have 32-bit SPR subregs. +def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]> { + let SubRegClassList = [SPR, SPR, SPR, SPR, DPR_VFP2, DPR_VFP2]; } // Condition code registers. @@ -341,4 +391,3 @@ def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15], [D1, D3, D5, D7, D9, D11, D13, D15, D17, D19, D21, D23, D25, D27, D29, D31]>; - diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 75fa707..fc4c5f5 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -10,26 +10,151 @@ //===----------------------------------------------------------------------===// // Functional units across ARM processors // -def FU_iALU : FuncUnit; // Integer alu unit -def FU_iLdSt : FuncUnit; // Integer load / store unit -def FU_FpALU : FuncUnit; // FP alu unit -def FU_FpLdSt : FuncUnit; // FP load / store unit -def FU_Br : FuncUnit; // Branch unit +def FU_Issue : FuncUnit; // issue +def FU_Pipe0 : FuncUnit; // pipeline 0 +def FU_Pipe1 : FuncUnit; // pipeline 1 +def FU_LdSt0 : FuncUnit; // pipeline 0 load/store +def FU_LdSt1 : FuncUnit; // pipeline 1 load/store +def FU_NPipe : FuncUnit; // NEON ALU/MUL pipe +def FU_NLSPipe : FuncUnit; // NEON LS pipe //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for ARM // -def IIC_iALU : InstrItinClass; -def IIC_iLoad : InstrItinClass; -def IIC_iStore : InstrItinClass; -def IIC_fpALU : InstrItinClass; -def IIC_fpLoad : InstrItinClass; -def IIC_fpStore : InstrItinClass; -def IIC_Br : InstrItinClass; +def IIC_iALUx : InstrItinClass; +def IIC_iALUi : InstrItinClass; +def IIC_iALUr : InstrItinClass; +def IIC_iALUsi : InstrItinClass; +def IIC_iALUsr : InstrItinClass; +def IIC_iUNAr : InstrItinClass; +def IIC_iUNAsi : InstrItinClass; +def IIC_iUNAsr : InstrItinClass; +def IIC_iCMPi : InstrItinClass; +def IIC_iCMPr : InstrItinClass; +def IIC_iCMPsi : InstrItinClass; +def IIC_iCMPsr : InstrItinClass; +def IIC_iMOVi : InstrItinClass; +def IIC_iMOVr : InstrItinClass; +def IIC_iMOVsi : InstrItinClass; +def IIC_iMOVsr : InstrItinClass; +def IIC_iCMOVi : InstrItinClass; +def IIC_iCMOVr : InstrItinClass; +def IIC_iCMOVsi : InstrItinClass; +def IIC_iCMOVsr : InstrItinClass; +def IIC_iMUL16 : InstrItinClass; +def IIC_iMAC16 : InstrItinClass; +def IIC_iMUL32 : InstrItinClass; +def IIC_iMAC32 : InstrItinClass; +def IIC_iMUL64 : InstrItinClass; +def IIC_iMAC64 : InstrItinClass; +def IIC_iLoadi : InstrItinClass; +def IIC_iLoadr : InstrItinClass; +def IIC_iLoadsi : InstrItinClass; +def IIC_iLoadiu : InstrItinClass; +def IIC_iLoadru : InstrItinClass; +def IIC_iLoadsiu : InstrItinClass; +def IIC_iLoadm : 
InstrItinClass; +def IIC_iStorei : InstrItinClass; +def IIC_iStorer : InstrItinClass; +def IIC_iStoresi : InstrItinClass; +def IIC_iStoreiu : InstrItinClass; +def IIC_iStoreru : InstrItinClass; +def IIC_iStoresiu : InstrItinClass; +def IIC_iStorem : InstrItinClass; +def IIC_Br : InstrItinClass; +def IIC_fpSTAT : InstrItinClass; +def IIC_fpUNA32 : InstrItinClass; +def IIC_fpUNA64 : InstrItinClass; +def IIC_fpCMP32 : InstrItinClass; +def IIC_fpCMP64 : InstrItinClass; +def IIC_fpCVTSD : InstrItinClass; +def IIC_fpCVTDS : InstrItinClass; +def IIC_fpCVTIS : InstrItinClass; +def IIC_fpCVTID : InstrItinClass; +def IIC_fpCVTSI : InstrItinClass; +def IIC_fpCVTDI : InstrItinClass; +def IIC_fpALU32 : InstrItinClass; +def IIC_fpALU64 : InstrItinClass; +def IIC_fpMUL32 : InstrItinClass; +def IIC_fpMUL64 : InstrItinClass; +def IIC_fpMAC32 : InstrItinClass; +def IIC_fpMAC64 : InstrItinClass; +def IIC_fpDIV32 : InstrItinClass; +def IIC_fpDIV64 : InstrItinClass; +def IIC_fpSQRT32 : InstrItinClass; +def IIC_fpSQRT64 : InstrItinClass; +def IIC_fpLoad32 : InstrItinClass; +def IIC_fpLoad64 : InstrItinClass; +def IIC_fpLoadm : InstrItinClass; +def IIC_fpStore32 : InstrItinClass; +def IIC_fpStore64 : InstrItinClass; +def IIC_fpStorem : InstrItinClass; +def IIC_VLD1 : InstrItinClass; +def IIC_VLD2 : InstrItinClass; +def IIC_VLD3 : InstrItinClass; +def IIC_VLD4 : InstrItinClass; +def IIC_VST : InstrItinClass; +def IIC_VUNAD : InstrItinClass; +def IIC_VUNAQ : InstrItinClass; +def IIC_VBIND : InstrItinClass; +def IIC_VBINQ : InstrItinClass; +def IIC_VMOVImm : InstrItinClass; +def IIC_VMOVD : InstrItinClass; +def IIC_VMOVQ : InstrItinClass; +def IIC_VMOVIS : InstrItinClass; +def IIC_VMOVID : InstrItinClass; +def IIC_VMOVISL : InstrItinClass; +def IIC_VMOVSI : InstrItinClass; +def IIC_VMOVDI : InstrItinClass; +def IIC_VPERMD : InstrItinClass; +def IIC_VPERMQ : InstrItinClass; +def IIC_VPERMQ3 : InstrItinClass; +def IIC_VMACD : InstrItinClass; +def IIC_VMACQ : InstrItinClass; +def IIC_VRECSD : InstrItinClass; +def IIC_VRECSQ : InstrItinClass; +def IIC_VCNTiD : InstrItinClass; +def IIC_VCNTiQ : InstrItinClass; +def IIC_VUNAiD : InstrItinClass; +def IIC_VUNAiQ : InstrItinClass; +def IIC_VQUNAiD : InstrItinClass; +def IIC_VQUNAiQ : InstrItinClass; +def IIC_VBINiD : InstrItinClass; +def IIC_VBINiQ : InstrItinClass; +def IIC_VSUBiD : InstrItinClass; +def IIC_VSUBiQ : InstrItinClass; +def IIC_VBINi4D : InstrItinClass; +def IIC_VBINi4Q : InstrItinClass; +def IIC_VSHLiD : InstrItinClass; +def IIC_VSHLiQ : InstrItinClass; +def IIC_VSHLi4D : InstrItinClass; +def IIC_VSHLi4Q : InstrItinClass; +def IIC_VPALiD : InstrItinClass; +def IIC_VPALiQ : InstrItinClass; +def IIC_VMULi16D : InstrItinClass; +def IIC_VMULi32D : InstrItinClass; +def IIC_VMULi16Q : InstrItinClass; +def IIC_VMULi32Q : InstrItinClass; +def IIC_VMACi16D : InstrItinClass; +def IIC_VMACi32D : InstrItinClass; +def IIC_VMACi16Q : InstrItinClass; +def IIC_VMACi32Q : InstrItinClass; +def IIC_VEXTD : InstrItinClass; +def IIC_VEXTQ : InstrItinClass; +def IIC_VTB1 : InstrItinClass; +def IIC_VTB2 : InstrItinClass; +def IIC_VTB3 : InstrItinClass; +def IIC_VTB4 : InstrItinClass; +def IIC_VTBX1 : InstrItinClass; +def IIC_VTBX2 : InstrItinClass; +def IIC_VTBX3 : InstrItinClass; +def IIC_VTBX4 : InstrItinClass; //===----------------------------------------------------------------------===// // Processor instruction itineraries. 
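Each IIC_* definition above is only a key: instruction definitions name one of these classes, and each processor model then binds every class to pipeline stages and per-operand read/write cycles. Roughly, and ignoring LLVM's exact field names, the bound data has this shape (illustrative model only):

    // Simplified model of what one itinerary entry carries.
    struct Stage {
      unsigned Cycles;    // cycles this stage holds its functional unit(s)
      unsigned UnitsMask; // acceptable units, e.g. FU_Pipe0 | FU_Pipe1
      int NextCycles;     // offset to the next stage's start; -1 => Cycles
    };

    struct ItineraryEntry {
      const Stage *StagesBegin, *StagesEnd;   // occupancy of the pipeline
      const unsigned *OperandCycles;          // e.g. [2, 2, 1]: result ready
      unsigned NumOperandCycles;              // at 2, last source read at 1
    };

    // Def-to-use latency between two instructions is then roughly:
    //   cycles(def operand of producer) - cycles(use operand of consumer)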
def GenericItineraries : ProcessorItineraries<[]>; + include "ARMScheduleV6.td" +include "ARMScheduleV7.td" diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index 596a57f..1ace718 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -1,4 +1,4 @@ -//===- ARMSchedule.td - ARM v6 Scheduling Definitions ------*- tablegen -*-===// +//===- ARMScheduleV6.td - ARM v6 Scheduling Definitions ----*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -11,12 +11,4 @@ // //===----------------------------------------------------------------------===// -def V6Itineraries : ProcessorItineraries<[ - InstrItinData<IIC_iALU , [InstrStage<1, [FU_iALU]>]>, - InstrItinData<IIC_iLoad , [InstrStage<2, [FU_iLdSt]>]>, - InstrItinData<IIC_iStore , [InstrStage<1, [FU_iLdSt]>]>, - InstrItinData<IIC_fpALU , [InstrStage<6, [FU_FpALU]>]>, - InstrItinData<IIC_fpLoad , [InstrStage<2, [FU_FpLdSt]>]>, - InstrItinData<IIC_fpStore , [InstrStage<1, [FU_FpLdSt]>]>, - InstrItinData<IIC_Br , [InstrStage<3, [FU_Br]>]> -]>; +// TODO: Add model for an ARM11 diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td new file mode 100644 index 0000000..e565813 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleV7.td @@ -0,0 +1,587 @@ +//===- ARMScheduleV7.td - ARM v7 Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM v7 processors. +// +//===----------------------------------------------------------------------===// + +// +// Scheduling information derived from "Cortex-A8 Technical Reference Manual". 
+// +// Dual issue pipeline represented by FU_Pipe0 | FU_Pipe1 +// +def CortexA8Itineraries : ProcessorItineraries<[ + + // Two fully-pipelined integer ALU pipelines + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iUNAsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1, 1]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>, + + // Integer multiply pipeline + // Result written in E5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [FU_Pipe0]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [FU_Pipe1], 0>, + InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [FU_Pipe1], 0>, + InstrStage<2, [FU_Pipe0]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [FU_Pipe1], 0>, + InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL64 , [InstrStage<2, [FU_Pipe1], 0>, + InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<2, [FU_Pipe1], 0>, + InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>, + + // Integer load pipeline + // + // loads have an extra cycle of latency, but are fully pipelined + // use FU_Issue to enforce the 1 load/store per cycle limit + // + // Immediate offset + InstrItinData<IIC_iLoadi , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iLoadr , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iLoadsi , [InstrStage<2, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0], 0>, + InstrStage<1, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoadiu , [InstrStage<1, [FU_Issue], 0>, + 
InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoadru , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoadsiu , [InstrStage<2, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0], 0>, + InstrStage<1, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [4, 3, 1, 1]>, + // + // Load multiple + InstrItinData<IIC_iLoadm , [InstrStage<2, [FU_Issue], 0>, + InstrStage<2, [FU_Pipe0], 0>, + InstrStage<2, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>]>, + + // Integer store pipeline + // + // use FU_Issue to enforce the 1 load/store per cycle limit + // + // Immediate offset + InstrItinData<IIC_iStorei , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iStorer , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iStoresi , [InstrStage<2, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0], 0>, + InstrStage<1, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStoreiu , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [2, 3, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStoreru , [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [2, 3, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iStoresiu, [InstrStage<2, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0], 0>, + InstrStage<1, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>], [3, 3, 1, 1]>, + // + // Store multiple + InstrItinData<IIC_iStorem , [InstrStage<2, [FU_Issue], 0>, + InstrStage<2, [FU_Pipe0], 0>, + InstrStage<2, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0]>]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. 
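A note on the InstrStage encoding used throughout this table: InstrStage<N, [U...]> holds one of the listed units for N cycles, and a trailing 0, as in InstrStage<1, [FU_Issue], 0>, means the next stage starts in the same cycle, so the unit is reserved in parallel rather than serially. A small sketch of how stage start cycles fall out of that rule (hypothetical model, not LLVM code):

    // Model of stage timing for one itinerary entry.
    struct Stage { unsigned Cycles; int Next; }; // Next < 0 => Next = Cycles

    // Cycle at which stage I begins, given the stages before it.
    unsigned startCycle(const Stage *Stages, unsigned I) {
      unsigned Start = 0;
      for (unsigned K = 0; K != I; ++K)
        Start += Stages[K].Next < 0 ? Stages[K].Cycles
                                    : (unsigned)Stages[K].Next;
      return Start;
    }

    // IIC_iMUL32 above: InstrStage<1, [FU_Pipe1], 0>, InstrStage<2, [FU_Pipe0]>
    // => both stages start at cycle 0: pipe 1 is blocked for the cycle in
    // which pipe 0 begins its two-cycle multiply, so nothing can dual-issue
    // alongside the MUL.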
+ // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<4, [FU_NPipe], 0>, + InstrStage<4, [FU_NLSPipe]>]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<4, [FU_NPipe], 0>, + InstrStage<4, [FU_NLSPipe]>]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<7, [FU_NPipe], 0>, + InstrStage<7, [FU_NLSPipe]>]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<5, [FU_NPipe], 0>, + InstrStage<5, [FU_NLSPipe]>]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<8, [FU_NPipe], 0>, + InstrStage<8, [FU_NLSPipe]>]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<8, [FU_NPipe], 0>, + InstrStage<8, [FU_NLSPipe]>]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<9, [FU_NPipe], 0>, + InstrStage<9, [FU_NLSPipe]>]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<11, [FU_NPipe], 0>, + InstrStage<11, [FU_NLSPipe]>]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [7, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<19, [FU_NPipe], 0>, + InstrStage<19, [FU_NLSPipe]>]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<20, [FU_NPipe], 0>, + InstrStage<20, [FU_NLSPipe]>]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<29, [FU_NPipe], 0>, + InstrStage<29, [FU_NLSPipe]>]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<19, [FU_NPipe], 0>, + InstrStage<19, [FU_NLSPipe]>]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<29, [FU_NPipe], 0>, + InstrStage<29, [FU_NLSPipe]>]>, + // + // Single-precision FP Load + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoad32, 
[InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // Double-precision FP Load + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoad64, [InstrStage<2, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0], 0>, + InstrStage<1, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // FP Load Multiple + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpLoadm, [InstrStage<3, [FU_Issue], 0>, + InstrStage<2, [FU_Pipe0], 0>, + InstrStage<2, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // Single-precision FP Store + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore32,[InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // Double-precision FP Store + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore64,[InstrStage<2, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0], 0>, + InstrStage<1, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // FP Store Multiple + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStorem, [InstrStage<3, [FU_Issue], 0>, + InstrStage<2, [FU_Pipe0], 0>, + InstrStage<2, [FU_Pipe1]>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + + // NEON + // Issue through integer pipeline, and execute in NEON unit. + // + // VLD1 + InstrItinData<IIC_VLD1, [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // VLD2 + InstrItinData<IIC_VLD2, [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>], [2, 2, 1]>, + // + // VLD3 + InstrItinData<IIC_VLD3, [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 1]>, + // + // VLD4 + InstrItinData<IIC_VLD4, [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 2, 1]>, + // + // VST + InstrItinData<IIC_VST, [InstrStage<1, [FU_Issue], 0>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_LdSt0], 0>, + InstrStage<1, [FU_NLSPipe]>]>, + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [6, 2]>, + // + // Double-register FP Binary + InstrItinData<IIC_VBIND, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [5, 2, 2]>, + // + // Quad-register FP Binary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VBINQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [6, 2, 2]>, + // + // Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + 
InstrStage<1, [FU_NPipe]>], [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [2, 1]>, + // + // Quad-register Permute Move + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VMOVQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [20, 20, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>, + // + // Double-register Permute + InstrItinData<IIC_VPERMD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [2, 2, 1, 1]>, + // + // Quad-register Permute + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VPERMQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 3, 1, 1]>, + // + // Quad-register Permute (3 cycle issue) + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>, + InstrStage<1, [FU_NPipe], 0>, + InstrStage<2, [FU_NLSPipe]>], [4, 4, 1, 1]>, + // + // Double-register FP Multiple-Accumulate + InstrItinData<IIC_VMACD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [9, 2, 2, 3]>, + // + // Quad-register FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VMACQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [10, 2, 2, 3]>, + // + // Double-register Reciprical Step + InstrItinData<IIC_VRECSD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [9, 2, 2]>, + // + // Quad-register Reciprical Step + InstrItinData<IIC_VRECSQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [10, 2, 2]>, + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [4, 2, 2]>, + // + // Double-register Integer Unary + InstrItinData<IIC_VUNAiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 2]>, + // + // Quad-register Integer Unary + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 2]>, + // + // Double-register Integer Q-Unary + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 1]>, + // + // Quad-register Integer 
Q-Unary + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 1]>, + // + // Double-register Integer Binary + InstrItinData<IIC_VBINiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Binary + InstrItinData<IIC_VBINiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 2, 2]>, + // + // Double-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 2, 1]>, + // + // Quad-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 2, 1]>, + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 2, 1]>, + // + // Double-register Integer Shift + InstrItinData<IIC_VSHLiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 1, 1]>, + // + // Quad-register Integer Shift + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [4, 1, 1]>, + // + // Double-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [4, 1, 1]>, + // + // Quad-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [5, 1, 1]>, + // + // Double-register Integer Pair Add Long + InstrItinData<IIC_VPALiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register Integer Pair Add Long + InstrItinData<IIC_VPALiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [7, 3, 2, 1]>, + // + // Double-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [6, 2, 2]>, + // + // Double-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [7, 2, 1]>, + // + // Quad-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [7, 2, 2]>, + // + // Quad-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>, + InstrStage<2, [FU_NLSPipe], 0>, + InstrStage<3, [FU_NPipe]>], [9, 2, 1]>, + // + // Double-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [6, 2, 2, 3]>, + // + // Double-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [7, 2, 1, 3]>, + // + // Quad-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [7, 2, 2, 3]>, + // + // Quad-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>, + InstrStage<2, [FU_NLSPipe], 0>, + InstrStage<3, [FU_NPipe]>], [9, 2, 1, 3]>, + // + // Double-register VEXT + InstrItinData<IIC_VEXTD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>], [2,
1, 1]>, + // + // Quad-register VEXT + InstrItinData<IIC_VEXTQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>, + // + // VTB + InstrItinData<IIC_VTB1, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_VTB2, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 2, 2, 1]>, + InstrItinData<IIC_VTB3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>, + InstrStage<1, [FU_NPipe], 0>, + InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>, + InstrStage<1, [FU_NPipe], 0>, + InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 3, 1]>, + // + // VTBX + InstrItinData<IIC_VTBX1, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 1]>, + InstrItinData<IIC_VTBX2, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 2, 1]>, + InstrItinData<IIC_VTBX3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>, + InstrStage<1, [FU_NPipe], 0>, + InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 1]>, + InstrItinData<IIC_VTBX4, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NLSPipe]>, + InstrStage<1, [FU_NPipe], 0>, + InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> +]>; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index e611088..cf1ee3f 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -13,8 +13,7 @@ #include "ARMSubtarget.h" #include "ARMGenSubtarget.inc" -#include "llvm/Module.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/GlobalValue.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -22,13 +21,19 @@ using namespace llvm; static cl::opt<bool> ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); +static cl::opt<bool> +UseNEONFP("arm-use-neon-fp", + cl::desc("Use NEON for single-precision FP"), + cl::init(false), cl::Hidden); -ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, +ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isThumb) : ARMArchVersion(V4T) , ARMFPUType(None) + , UseNEONForSinglePrecisionFP(UseNEONFP) , IsThumb(isThumb) , ThumbMode(Thumb1) + , PostRAScheduler(false) , IsR9Reserved(ReserveR9) , stackAlignment(4) , CPUString("generic") @@ -45,7 +50,6 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, // Set the boolean corresponding to the current target triple, or the default // if one cannot be determined, to true. - const std::string& TT = M.getTargetTriple(); unsigned Len = TT.length(); unsigned Idx = 0; @@ -75,14 +79,14 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, } } + // Thumb2 implies at least V6T2. + if (ARMArchVersion < V6T2 && ThumbMode >= Thumb2) + ARMArchVersion = V6T2; + if (Len >= 10) { if (TT.find("-darwin") != std::string::npos) // arm-darwin TargetType = isDarwin; - } else if (TT.empty()) { -#if defined(__APPLE__) - TargetType = isDarwin; -#endif } if (TT.find("eabi") != std::string::npos) @@ -93,4 +97,61 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, if (isTargetDarwin()) IsR9Reserved = ReserveR9 | (ARMArchVersion < V6); + + // Set CPU specific features. 
+ if (CPUString == "cortex-a8") { + PostRAScheduler = true; + if (UseNEONFP.getPosition() == 0) + UseNEONForSinglePrecisionFP = true; + } +} + +/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. +bool +ARMSubtarget::GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const { + if (RelocM == Reloc::Static) + return false; + + // GV with ghost linkage (in JIT lazy compilation mode) do not require an + // extra load from stub. + bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode(); + + if (!isTargetDarwin()) { + // Extra load is needed for all externally visible. + if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + return false; + return true; + } else { + if (RelocM == Reloc::PIC_) { + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + + // If symbol visibility is hidden, we have a stub for common symbol + // references and external declarations. + if (isDecl || GV->hasCommonLinkage()) + // Hidden $non_lazy_ptr reference. + return true; + + return false; + } else { + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + } + } + + return false; } diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 5110b31..7098fd4 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -15,11 +15,12 @@ #define ARMSUBTARGET_H #include "llvm/Target/TargetInstrItineraries.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtarget.h" #include <string> namespace llvm { -class Module; +class GlobalValue; class ARMSubtarget : public TargetSubtarget { protected: @@ -43,12 +44,20 @@ protected: /// ARMFPUType - Floating Point Unit type. ARMFPEnum ARMFPUType; + /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been + /// specified. Use the method useNEONForSinglePrecisionFP() to + /// determine if NEON should actually be used. + bool UseNEONForSinglePrecisionFP; + /// IsThumb - True if we are in thumb mode, false if in ARM mode. bool IsThumb; /// ThumbMode - Indicates supported Thumb version. ThumbTypeEnum ThumbMode; + /// PostRAScheduler - True if using post-register-allocation scheduler. + bool PostRAScheduler; + /// IsR9Reserved - True if R9 is a not available as general purpose register. bool IsR9Reserved; @@ -61,7 +70,7 @@ protected: /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; - + public: enum { isELF, isDarwin @@ -73,9 +82,9 @@ protected: } TargetABI; /// This constructor initializes the data members to match that - /// of the specified module. + /// of the specified triple. 
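The GVIsIndirectSymbol helper added above decides when a global must be reached through a $non_lazy_ptr-style indirection. Its decision tree condenses to the following predicate (an illustrative restatement, with the JIT ghost-linkage wrinkle folded into Decl):

    // Same decisions as GVIsIndirectSymbol, flattened (sketch only).
    bool isIndirect(bool StaticReloc, bool Darwin, bool PIC,
                    bool LocalLinkage, bool Hidden, bool Decl,
                    bool WeakForLinker, bool Common) {
      if (StaticReloc) return false;            // direct references throughout
      if (!Darwin)
        return !LocalLinkage && !Hidden;        // ELF: externally visible only
      if (!Decl && !WeakForLinker)
        return false;                           // strong definition: direct
      if (!Hidden)
        return true;                            // default visibility: use stub
      return PIC && (Decl || Common);           // hidden: stub decls/common,
    }                                           // and only under PIC

The updated constructor signature that the doc comment above describes follows.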
/// - ARMSubtarget(const Module &M, const std::string &FS, bool isThumb); + ARMSubtarget(const std::string &TT, const std::string &FS, bool isThumb); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -99,6 +108,8 @@ protected: bool hasVFP2() const { return ARMFPUType >= VFPv2; } bool hasVFP3() const { return ARMFPUType >= VFPv3; } bool hasNEON() const { return ARMFPUType >= NEON; } + bool useNEONForSinglePrecisionFP() const { + return hasNEON() && UseNEONForSinglePrecisionFP; } bool isTargetDarwin() const { return TargetType == isDarwin; } bool isTargetELF() const { return TargetType == isELF; } @@ -108,14 +119,18 @@ protected: bool isThumb() const { return IsThumb; } bool isThumb1Only() const { return IsThumb && (ThumbMode == Thumb1); } - bool isThumb2() const { return IsThumb && (ThumbMode >= Thumb2); } + bool isThumb2() const { return IsThumb && (ThumbMode == Thumb2); } bool hasThumb2() const { return ThumbMode >= Thumb2; } bool isR9Reserved() const { return IsR9Reserved; } const std::string & getCPUString() const { return CPUString; } + + /// enablePostRAScheduler - From TargetSubtarget, return true to + /// enable post-RA scheduler. + bool enablePostRAScheduler() const { return PostRAScheduler; } - /// getInstrItins - Return the instruction itineraies based on subtarget + /// getInstrItins - Return the instruction itineraies based on subtarget /// selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } @@ -123,6 +138,10 @@ protected: /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. unsigned getStackAlignment() const { return stackAlignment; } + + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect + /// symbol. + bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const; }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 2344733..32ddc20 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -11,188 +11,122 @@ //===----------------------------------------------------------------------===// #include "ARMTargetMachine.h" -#include "ARMTargetAsmInfo.h" +#include "ARMMCAsmInfo.h" #include "ARMFrameInfo.h" #include "ARM.h" -#include "llvm/Module.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachineRegistry.h" +#include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegistry.h" using namespace llvm; -static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden, - cl::desc("Disable load store optimization pass")); -static cl::opt<bool> DisableIfConversion("disable-arm-if-conversion",cl::Hidden, - cl::desc("Disable if-conversion pass")); - -/// ARMTargetMachineModule - Note that this is used on hosts that cannot link -/// in a library unless there are references into the library. In particular, -/// it seems that it is not possible to get things to work on Win32 without -/// this. Though it is unused, do not remove it. -extern "C" int ARMTargetMachineModule; -int ARMTargetMachineModule = 0; - -// Register the target. -static RegisterTarget<ARMTargetMachine> X("arm", "ARM"); -static RegisterTarget<ThumbTargetMachine> Y("thumb", "Thumb"); - -// Force static initialization. 
-extern "C" void LLVMInitializeARMTarget() { } - -// No assembler printer by default -ARMBaseTargetMachine::AsmPrinterCtorFn ARMBaseTargetMachine::AsmPrinterCtor = 0; - -/// ThumbTargetMachine - Create an Thumb architecture model. -/// -unsigned ThumbTargetMachine::getJITMatchQuality() { -#if defined(__thumb__) - return 10; -#endif - return 0; +static const MCAsmInfo *createMCAsmInfo(const Target &T, + const StringRef &TT) { + Triple TheTriple(TT); + switch (TheTriple.getOS()) { + case Triple::Darwin: + return new ARMMCAsmInfoDarwin(); + default: + return new ARMELFMCAsmInfo(); + } } -unsigned ThumbTargetMachine::getModuleMatchQuality(const Module &M) { - std::string TT = M.getTargetTriple(); - // Match thumb-foo-bar, as well as things like thumbv5blah-* - if (TT.size() >= 6 && - (TT.substr(0, 6) == "thumb-" || TT.substr(0, 6) == "thumbv")) - return 20; - // If the target triple is something non-thumb, we don't match. - if (!TT.empty()) return 0; +extern "C" void LLVMInitializeARMTarget() { + // Register the target. + RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget); + RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget); - if (M.getEndianness() == Module::LittleEndian && - M.getPointerSize() == Module::Pointer32) - return 10; // Weak match - else if (M.getEndianness() != Module::AnyEndianness || - M.getPointerSize() != Module::AnyPointerSize) - return 0; // Match for some other target - - return getJITMatchQuality()/2; + // Register the target asm info. + RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo); + RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo); } /// TargetMachine ctor - Create an ARM architecture model. /// -ARMBaseTargetMachine::ARMBaseTargetMachine(const Module &M, +ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, + const std::string &TT, const std::string &FS, bool isThumb) - : Subtarget(M, FS, isThumb), + : LLVMTargetMachine(T, TT), + Subtarget(TT, FS, isThumb), FrameInfo(Subtarget), JITInfo(), InstrItins(Subtarget.getInstrItineraryData()) { DefRelocModel = getRelocationModel(); } -ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS) - : ARMBaseTargetMachine(M, FS, false), InstrInfo(Subtarget), +ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, + const std::string &FS) + : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? std::string("e-p:32:32-f64:32:32-i64:32:32") : std::string("e-p:32:32-f64:64:64-i64:64:64")), TLInfo(*this) { } -ThumbTargetMachine::ThumbTargetMachine(const Module &M, const std::string &FS) - : ARMBaseTargetMachine(M, FS, true), +ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, + const std::string &FS) + : ARMBaseTargetMachine(T, TT, FS, true), + InstrInfo(Subtarget.hasThumb2() + ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) + : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), DataLayout(Subtarget.isAPCS_ABI() ? 
std::string("e-p:32:32-f64:32:32-i64:32:32-" "i16:16:32-i8:8:32-i1:8:32-a:0:32") : std::string("e-p:32:32-f64:64:64-i64:64:64-" "i16:16:32-i8:8:32-i1:8:32-a:0:32")), TLInfo(*this) { - // Create the approriate type of Thumb InstrInfo - if (Subtarget.hasThumb2()) - InstrInfo = new Thumb2InstrInfo(Subtarget); - else - InstrInfo = new Thumb1InstrInfo(Subtarget); -} - -unsigned ARMTargetMachine::getJITMatchQuality() { -#if defined(__arm__) - return 10; -#endif - return 0; -} - -unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) { - std::string TT = M.getTargetTriple(); - // Match arm-foo-bar, as well as things like armv5blah-* - if (TT.size() >= 4 && - (TT.substr(0, 4) == "arm-" || TT.substr(0, 4) == "armv")) - return 20; - // If the target triple is something non-arm, we don't match. - if (!TT.empty()) return 0; - - if (M.getEndianness() == Module::LittleEndian && - M.getPointerSize() == Module::Pointer32) - return 10; // Weak match - else if (M.getEndianness() != Module::AnyEndianness || - M.getPointerSize() != Module::AnyPointerSize) - return 0; // Match for some other target - - return getJITMatchQuality()/2; } -const TargetAsmInfo *ARMBaseTargetMachine::createTargetAsmInfo() const { - switch (Subtarget.TargetType) { - case ARMSubtarget::isDarwin: - return new ARMDarwinTargetAsmInfo(*this); - case ARMSubtarget::isELF: - return new ARMELFTargetAsmInfo(*this); - default: - return new ARMGenericTargetAsmInfo(*this); - } -} - // Pass Pipeline Configuration bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { - PM.add(createARMISelDag(*this)); + PM.add(createARMISelDag(*this, OptLevel)); return false; } bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { - // FIXME: temporarily disabling load / store optimization pass for Thumb mode. - if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb()) + if (Subtarget.hasNEON()) + PM.add(createNEONPreAllocPass()); + + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); return true; } -bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - // FIXME: temporarily disabling load / store optimization pass for Thumb mode. - if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb()) +bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass()); - if (OptLevel != CodeGenOpt::None && - !DisableIfConversion && !Subtarget.isThumb()) - PM.add(createIfConverterPass()); - - PM.add(createARMConstantIslandPass()); return true; } -bool ARMBaseTargetMachine::addAssemblyEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, - bool Verbose, - raw_ostream &Out) { - // Output assembly language. - assert(AsmPrinterCtor && "AsmPrinter was not linked in"); - if (AsmPrinterCtor) - PM.add(AsmPrinterCtor(Out, *this, Verbose)); +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) + PM.add(createIfConverterPass()); + + if (Subtarget.isThumb2()) { + PM.add(createThumb2ITBlockPass()); + PM.add(createThumb2SizeReductionPass()); + } - return false; + PM.add(createARMConstantIslandPass()); + return true; } - bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, MachineCodeEmitter &MCE) { // FIXME: Move this to TargetJITInfo! if (DefRelocModel == Reloc::Default) @@ -200,18 +134,11 @@ bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, // Machine code emitter pass for ARM. PM.add(createARMCodeEmitterPass(*this, MCE)); - if (DumpAsm) { - assert(AsmPrinterCtor && "AsmPrinter was not linked in"); - if (AsmPrinterCtor) - PM.add(AsmPrinterCtor(errs(), *this, true)); - } - return false; } bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, JITCodeEmitter &JCE) { // FIXME: Move this to TargetJITInfo! if (DefRelocModel == Reloc::Default) @@ -219,43 +146,42 @@ bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, // Machine code emitter pass for ARM. PM.add(createARMJITCodeEmitterPass(*this, JCE)); - if (DumpAsm) { - assert(AsmPrinterCtor && "AsmPrinter was not linked in"); - if (AsmPrinterCtor) - PM.add(AsmPrinterCtor(errs(), *this, true)); - } + return false; +} + +bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, + CodeGenOpt::Level OptLevel, + ObjectCodeEmitter &OCE) { + // FIXME: Move this to TargetJITInfo! + if (DefRelocModel == Reloc::Default) + setRelocationModel(Reloc::Static); + // Machine code emitter pass for ARM. + PM.add(createARMObjectCodeEmitterPass(*this, OCE)); return false; } bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, MachineCodeEmitter &MCE) { // Machine code emitter pass for ARM. PM.add(createARMCodeEmitterPass(*this, MCE)); - if (DumpAsm) { - assert(AsmPrinterCtor && "AsmPrinter was not linked in"); - if (AsmPrinterCtor) - PM.add(AsmPrinterCtor(errs(), *this, true)); - } - return false; } bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, JITCodeEmitter &JCE) { // Machine code emitter pass for ARM. PM.add(createARMJITCodeEmitterPass(*this, JCE)); - if (DumpAsm) { - assert(AsmPrinterCtor && "AsmPrinter was not linked in"); - if (AsmPrinterCtor) - PM.add(AsmPrinterCtor(errs(), *this, true)); - } - return false; } +bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM, + CodeGenOpt::Level OptLevel, + ObjectCodeEmitter &OCE) { + // Machine code emitter pass for ARM. + PM.add(createARMObjectCodeEmitterPass(*this, OCE)); + return false; +} diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index a0df54d..71a5348 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -16,7 +16,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetFrameInfo.h" #include "ARMInstrInfo.h" #include "ARMFrameInfo.h" #include "ARMJITInfo.h" @@ -27,8 +26,6 @@ namespace llvm { -class Module; - class ARMBaseTargetMachine : public LLVMTargetMachine { protected: ARMSubtarget Subtarget; @@ -39,16 +36,9 @@ private: InstrItineraryData InstrItins; Reloc::Model DefRelocModel; // Reloc model before it's overridden. 
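The hook reshuffle above, with addPreEmitPass splitting into addPreSched2 plus a slimmer addPreEmitPass, tracks where the shared code-generation driver now runs them. Roughly, and omitting the many standard passes in between, the order is as follows (a sketch, not the actual LLVMTargetMachine driver):

    void buildARMPipeline(ARMBaseTargetMachine &TM, PassManagerBase &PM,
                          CodeGenOpt::Level OL) {
      TM.addInstSelector(PM, OL); // ARMISelDag: IR -> MachineInstrs
      TM.addPreRegAlloc(PM, OL);  // NEON pre-alloc; pre-RA load/store opt
      // ... register allocation runs here ...
      TM.addPreSched2(PM, OL);    // load/store opt before post-RA scheduling
      // ... post-RA scheduler (enabled for cortex-a8 via the subtarget) ...
      TM.addPreEmitPass(PM, OL);  // if-conversion, Thumb2 IT/size reduction,
                                  // constant islands
    }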
-protected: - // To avoid having target depend on the asmprinter stuff libraries, asmprinter - // set this functions to ctor pointer at startup time if they are linked in. - typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o, - ARMBaseTargetMachine &tm, - bool verbose); - static AsmPrinterCtorFn AsmPrinterCtor; - public: - ARMBaseTargetMachine(const Module &M, const std::string &FS, bool isThumb); + ARMBaseTargetMachine(const Target &T, const std::string &TT, + const std::string &FS, bool isThumb); virtual const ARMFrameInfo *getFrameInfo() const { return &FrameInfo; } virtual ARMJITInfo *getJITInfo() { return &JITInfo; } @@ -57,34 +47,26 @@ public: return InstrItins; } - static void registerAsmPrinter(AsmPrinterCtorFn F) { - AsmPrinterCtor = F; - } - - static unsigned getModuleMatchQuality(const Module &M); - static unsigned getJITMatchQuality(); - - virtual const TargetAsmInfo *createTargetAsmInfo() const; - // Pass Pipeline Configuration virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addAssemblyEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, - bool Verbose, raw_ostream &Out); virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, MachineCodeEmitter &MCE); + MachineCodeEmitter &MCE); + virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + JITCodeEmitter &MCE); virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, JITCodeEmitter &MCE); + ObjectCodeEmitter &OCE); virtual bool addSimpleCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, MachineCodeEmitter &MCE); virtual bool addSimpleCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - bool DumpAsm, JITCodeEmitter &MCE); + virtual bool addSimpleCodeEmitter(PassManagerBase &PM, + CodeGenOpt::Level OptLevel, + ObjectCodeEmitter &OCE); }; /// ARMTargetMachine - ARM target machine. @@ -94,7 +76,8 @@ class ARMTargetMachine : public ARMBaseTargetMachine { const TargetData DataLayout; // Calculates type size & alignment ARMTargetLowering TLInfo; public: - ARMTargetMachine(const Module &M, const std::string &FS); + ARMTargetMachine(const Target &T, const std::string &TT, + const std::string &FS); virtual const ARMRegisterInfo *getRegisterInfo() const { return &InstrInfo.getRegisterInfo(); @@ -106,9 +89,6 @@ public: virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; } virtual const TargetData *getTargetData() const { return &DataLayout; } - - static unsigned getJITMatchQuality(); - static unsigned getModuleMatchQuality(const Module &M); }; /// ThumbTargetMachine - Thumb target machine. 
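With getJITMatchQuality and getModuleMatchQuality deleted, target selection no longer sniffs Module properties; callers resolve a triple through the registry that LLVMInitializeARMTarget populates. A sketch using the registry API of this period (error handling trimmed; the triple and feature string are examples):

    std::string Err;
    const Target *T =
        TargetRegistry::lookupTarget("thumbv7-apple-darwin9", Err);
    TargetMachine *TM =
        T ? T->createTargetMachine("thumbv7-apple-darwin9",
                                   /*Features=*/"+neon")
          : 0; // on failure, Err explains why no target matched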
@@ -120,7 +100,8 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { const TargetData DataLayout; // Calculates type size & alignment ARMTargetLowering TLInfo; public: - ThumbTargetMachine(const Module &M, const std::string &FS); + ThumbTargetMachine(const Target &T, const std::string &TT, + const std::string &FS); /// returns either Thumb1RegisterInfo of Thumb2RegisterInfo virtual const ARMBaseRegisterInfo *getRegisterInfo() const { @@ -134,9 +115,6 @@ public: /// returns either Thumb1InstrInfo or Thumb2InstrInfo virtual const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo; } virtual const TargetData *getTargetData() const { return &DataLayout; } - - static unsigned getJITMatchQuality(); - static unsigned getModuleMatchQuality(const Module &M); }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h new file mode 100644 index 0000000..9703403 --- /dev/null +++ b/lib/Target/ARM/ARMTargetObjectFile.h @@ -0,0 +1,39 @@ +//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H +#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H + +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/MC/MCSectionELF.h" + +namespace llvm { + + class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { + public: + ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {} + + void Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) { + StaticCtorSection = + getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY, + MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC, + SectionKind::getDataRel()); + StaticDtorSection = + getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY, + MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC, + SectionKind::getDataRel()); + } + } + }; +} // end namespace llvm + +#endif diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp new file mode 100644 index 0000000..7438ea9 --- /dev/null +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -0,0 +1,618 @@ +//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
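One consequence of the ARMElfTargetObjectFile hunk above: on AAPCS targets, static constructors and destructors are routed to SHT_INIT_ARRAY/SHT_FINI_ARRAY sections per the ARM EABI, rather than to the older .ctors/.dtors scheme. For example (an illustrative translation unit, not from the commit):

    struct Init {
      Init() { /* runs before main() */ }
    };
    static Init TheInit;
    // AAPCS: a pointer to the generated constructor lands in .init_array
    // (SHF_WRITE | SHF_ALLOC); the C runtime walks that array at startup.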
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmLexer.h" +#include "llvm/MC/MCAsmParser.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/Target/TargetAsmParser.h" +using namespace llvm; + +namespace { +struct ARMOperand; + +// The shift types for register controlled shifts in arm memory addressing +enum ShiftType { + Lsl, + Lsr, + Asr, + Ror, + Rrx +}; + +class ARMAsmParser : public TargetAsmParser { + MCAsmParser &Parser; + +private: + MCAsmParser &getParser() const { return Parser; } + + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + + bool ParseRegister(ARMOperand &Op); + + bool ParseRegisterList(ARMOperand &Op); + + bool ParseMemory(ARMOperand &Op); + + bool ParseShift(enum ShiftType *St, const MCExpr *&ShiftAmount); + + bool ParseOperand(ARMOperand &Op); + + bool ParseDirectiveWord(unsigned Size, SMLoc L); + + // TODO - For now hacked versions of the next two are in here in this file to + // allow some parser testing until the table gen versions are implemented. + + /// @name Auto-generated Match Functions + /// { + bool MatchInstruction(SmallVectorImpl<ARMOperand> &Operands, + MCInst &Inst); + + /// MatchRegisterName - Match the given string to a register name and return + /// its register number, or -1 if there is no match. To allow return values + /// to be used directly in register lists, arm registers have values between + /// 0 and 15. + int MatchRegisterName(const StringRef &Name); + + /// } + + +public: + ARMAsmParser(const Target &T, MCAsmParser &_Parser) + : TargetAsmParser(T), Parser(_Parser) {} + + virtual bool ParseInstruction(const StringRef &Name, MCInst &Inst); + + virtual bool ParseDirective(AsmToken DirectiveID); +}; + +} // end anonymous namespace + +namespace { + +/// ARMOperand - Instances of this class represent a parsed ARM machine +/// instruction. 
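MatchRegisterName, declared above, deliberately maps registers to 0..15 so that results can be ORed straight into register-list bitmasks. Until the TableGen-generated matcher lands, a hand-rolled version looks roughly like this (a sketch, not this file's literal if-chain):

    int matchRegisterName(const StringRef &Name) {
      if (Name.size() >= 2 && (Name[0] == 'r' || Name[0] == 'R')) {
        unsigned N = 0;
        for (unsigned I = 1, E = Name.size(); I != E; ++I) {
          if (Name[I] < '0' || Name[I] > '9') return -1;
          N = N * 10 + (Name[I] - '0');
        }
        return N <= 15 ? (int)N : -1;   // r0..r15
      }
      if (Name == "sp") return 13;      // common aliases
      if (Name == "lr") return 14;
      if (Name == "pc") return 15;
      return -1;                        // not a register name
    }

The ARMOperand representation that these parser methods build is defined next.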
+struct ARMOperand { + enum { + Token, + Register, + Immediate, + Memory + } Kind; + + + union { + struct { + const char *Data; + unsigned Length; + } Tok; + + struct { + unsigned RegNum; + bool Writeback; + } Reg; + + struct { + const MCExpr *Val; + } Imm; + + // This is for all forms of ARM address expressions + struct { + unsigned BaseRegNum; + bool OffsetIsReg; + const MCExpr *Offset; // used when OffsetIsReg is false + unsigned OffsetRegNum; // used when OffsetIsReg is true + bool OffsetRegShifted; // only used when OffsetIsReg is true + enum ShiftType ShiftType; // used when OffsetRegShifted is true + const MCExpr *ShiftAmount; // used when OffsetRegShifted is true + bool Preindexed; + bool Postindexed; + bool Negative; // only used when OffsetIsReg is true + bool Writeback; + } Mem; + + }; + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + unsigned getReg() const { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNum; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + bool isToken() const {return Kind == Token; } + + bool isReg() const { return Kind == Register; } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + static ARMOperand CreateToken(StringRef Str) { + ARMOperand Res; + Res.Kind = Token; + Res.Tok.Data = Str.data(); + Res.Tok.Length = Str.size(); + return Res; + } + + static ARMOperand CreateReg(unsigned RegNum, bool Writeback) { + ARMOperand Res; + Res.Kind = Register; + Res.Reg.RegNum = RegNum; + Res.Reg.Writeback = Writeback; + return Res; + } + + static ARMOperand CreateImm(const MCExpr *Val) { + ARMOperand Res; + Res.Kind = Immediate; + Res.Imm.Val = Val; + return Res; + } + + static ARMOperand CreateMem(unsigned BaseRegNum, bool OffsetIsReg, + const MCExpr *Offset, unsigned OffsetRegNum, + bool OffsetRegShifted, enum ShiftType ShiftType, + const MCExpr *ShiftAmount, bool Preindexed, + bool Postindexed, bool Negative, bool Writeback) { + ARMOperand Res; + Res.Kind = Memory; + Res.Mem.BaseRegNum = BaseRegNum; + Res.Mem.OffsetIsReg = OffsetIsReg; + Res.Mem.Offset = Offset; + Res.Mem.OffsetRegNum = OffsetRegNum; + Res.Mem.OffsetRegShifted = OffsetRegShifted; + Res.Mem.ShiftType = ShiftType; + Res.Mem.ShiftAmount = ShiftAmount; + Res.Mem.Preindexed = Preindexed; + Res.Mem.Postindexed = Postindexed; + Res.Mem.Negative = Negative; + Res.Mem.Writeback = Writeback; + return Res; + } +}; + +} // end anonymous namespace. + +// Try to parse a register name. The token must be an Identifier when called, +// and if it is a register name a Reg operand is created, the token is eaten +// and false is returned. Else true is returned and no token is eaten. +// TODO this is likely to change to allow different register types and or to +// parse for a specific register type. +bool ARMAsmParser::ParseRegister(ARMOperand &Op) { + const AsmToken &Tok = getLexer().getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + // FIXME: Validate register for the current architecture; we have to do + // validation later, so maybe there is no need for this here. + int RegNum; + + RegNum = MatchRegisterName(Tok.getString()); + if (RegNum == -1) + return true; + getLexer().Lex(); // Eat identifier token. 
+  bool Writeback = false;
+  const AsmToken &ExclaimTok = getLexer().getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Writeback = true;
+    getLexer().Lex(); // Eat exclaim token.
+  }
+
+  Op = ARMOperand::CreateReg(RegNum, Writeback);
+
+  return false;
+}
+
+// Try to parse a register list.  The first token must be a '{' when this is
+// called.
+bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
+  assert(getLexer().getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+  getLexer().Lex(); // Eat left curly brace token.
+
+  const AsmToken &RegTok = getLexer().getTok();
+  SMLoc RegLoc = RegTok.getLoc();
+  if (RegTok.isNot(AsmToken::Identifier))
+    return Error(RegLoc, "register expected");
+  int RegNum = MatchRegisterName(RegTok.getString());
+  if (RegNum == -1)
+    return Error(RegLoc, "register expected");
+  getLexer().Lex(); // Eat identifier token.
+  unsigned RegList = 1 << RegNum;
+
+  int HighRegNum = RegNum;
+  // TODO: ranges like "{Rn-Rm}"
+  while (getLexer().getTok().is(AsmToken::Comma)) {
+    getLexer().Lex(); // Eat comma token.
+
+    const AsmToken &RegTok = getLexer().getTok();
+    SMLoc RegLoc = RegTok.getLoc();
+    if (RegTok.isNot(AsmToken::Identifier))
+      return Error(RegLoc, "register expected");
+    int RegNum = MatchRegisterName(RegTok.getString());
+    if (RegNum == -1)
+      return Error(RegLoc, "register expected");
+
+    if (RegList & (1 << RegNum))
+      Warning(RegLoc, "register duplicated in register list");
+    else if (RegNum <= HighRegNum)
+      Warning(RegLoc, "register not in ascending order in register list");
+    RegList |= 1 << RegNum;
+    HighRegNum = RegNum;
+
+    getLexer().Lex(); // Eat identifier token.
+  }
+  const AsmToken &RCurlyTok = getLexer().getTok();
+  if (RCurlyTok.isNot(AsmToken::RCurly))
+    return Error(RCurlyTok.getLoc(), "'}' expected");
+  getLexer().Lex(); // Eat right curly brace token.
+
+  return false;
+}
+
+// Try to parse an ARM memory expression.  It must start with a '[' token.
+// TODO: Only pre-indexed and post-indexed addressing forms are handled so
+// far; unindexed addressing with an option, etc. are still to do.
+bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
+  assert(getLexer().getTok().is(AsmToken::LBrac) &&
+         "Token is not a Left Bracket");
+  getLexer().Lex(); // Eat left bracket token.
+
+  const AsmToken &BaseRegTok = getLexer().getTok();
+  if (BaseRegTok.isNot(AsmToken::Identifier))
+    return Error(BaseRegTok.getLoc(), "register expected");
+  int BaseRegNum = MatchRegisterName(BaseRegTok.getString());
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
+  getLexer().Lex(); // Eat identifier token.
+
+  bool Preindexed = false;
+  bool Postindexed = false;
+  bool OffsetIsReg = false;
+  bool Negative = false;
+  bool Writeback = false;
+
+  // First look for preindexed address forms:
+  //   [Rn, +/-Rm]
+  //   [Rn, #offset]
+  //   [Rn, +/-Rm, shift]
+  // That is, after the "[Rn" we have so far, see if the next token is a
+  // comma.
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.is(AsmToken::Comma)) {
+    Preindexed = true;
+    getLexer().Lex(); // Eat comma token.
+
+    const AsmToken &NextTok = getLexer().getTok();
+    if (NextTok.is(AsmToken::Plus))
+      getLexer().Lex(); // Eat plus token.
+    else if (NextTok.is(AsmToken::Minus)) {
+      Negative = true;
+      getLexer().Lex(); // Eat minus token.
+    }
+
+    // See if there is a register following the "[Rn," we have so far.
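+    // E.g. "[r0, r1]" and "[r0, r1, lsl #2]" take the register path below,
+    // while "[r0, #4]" is handled by the '#offset' path in the else branch.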
+    const AsmToken &OffsetRegTok = getLexer().getTok();
+    int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+    bool OffsetRegShifted = false;
+    enum ShiftType ShiftType;
+    const MCExpr *ShiftAmount;
+    const MCExpr *Offset;
+    if (OffsetRegNum != -1) {
+      OffsetIsReg = true;
+      getLexer().Lex(); // Eat identifier token for the offset register.
+      // Look for a comma then a shift.
+      const AsmToken &Tok = getLexer().getTok();
+      if (Tok.is(AsmToken::Comma)) {
+        getLexer().Lex(); // Eat comma token.
+
+        const AsmToken &Tok = getLexer().getTok();
+        if (ParseShift(&ShiftType, ShiftAmount))
+          return Error(Tok.getLoc(), "shift expected");
+        OffsetRegShifted = true;
+      }
+    }
+    else { // The "[Rn," we have so far was not followed by "Rm".
+      // Look for #offset following the "[Rn,".
+      const AsmToken &HashTok = getLexer().getTok();
+      if (HashTok.isNot(AsmToken::Hash))
+        return Error(HashTok.getLoc(), "'#' expected");
+      getLexer().Lex(); // Eat hash token.
+
+      if (getParser().ParseExpression(Offset))
+        return true;
+    }
+    const AsmToken &RBracTok = getLexer().getTok();
+    if (RBracTok.isNot(AsmToken::RBrac))
+      return Error(RBracTok.getLoc(), "']' expected");
+    getLexer().Lex(); // Eat right bracket token.
+
+    const AsmToken &ExclaimTok = getLexer().getTok();
+    if (ExclaimTok.is(AsmToken::Exclaim)) {
+      Writeback = true;
+      getLexer().Lex(); // Eat exclaim token.
+    }
+    Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+                               OffsetRegShifted, ShiftType, ShiftAmount,
+                               Preindexed, Postindexed, Negative, Writeback);
+    return false;
+  }
+  // The "[Rn" we have so far was not followed by a comma.
+  else if (Tok.is(AsmToken::RBrac)) {
+    // These are the post-indexed addressing forms:
+    //   [Rn], #offset
+    //   [Rn], +/-Rm
+    //   [Rn], +/-Rm, shift
+    // that is, a ']' follows directly after the "[Rn".
+    Postindexed = true;
+    Writeback = true;
+    getLexer().Lex(); // Eat right bracket token.
+
+    const AsmToken &CommaTok = getLexer().getTok();
+    if (CommaTok.isNot(AsmToken::Comma))
+      return Error(CommaTok.getLoc(), "',' expected");
+    getLexer().Lex(); // Eat comma token.
+
+    const AsmToken &NextTok = getLexer().getTok();
+    if (NextTok.is(AsmToken::Plus))
+      getLexer().Lex(); // Eat plus token.
+    else if (NextTok.is(AsmToken::Minus)) {
+      Negative = true;
+      getLexer().Lex(); // Eat minus token.
+    }
+
+    // See if there is a register following the "[Rn]," we have so far.
+    const AsmToken &OffsetRegTok = getLexer().getTok();
+    int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+    bool OffsetRegShifted = false;
+    enum ShiftType ShiftType;
+    const MCExpr *ShiftAmount;
+    const MCExpr *Offset;
+    if (OffsetRegNum != -1) {
+      OffsetIsReg = true;
+      getLexer().Lex(); // Eat identifier token for the offset register.
+      // Look for a comma then a shift.
+      const AsmToken &Tok = getLexer().getTok();
+      if (Tok.is(AsmToken::Comma)) {
+        getLexer().Lex(); // Eat comma token.
+
+        const AsmToken &Tok = getLexer().getTok();
+        if (ParseShift(&ShiftType, ShiftAmount))
+          return Error(Tok.getLoc(), "shift expected");
+        OffsetRegShifted = true;
+      }
+    }
+    else { // The "[Rn]," we have so far was not followed by "Rm".
+      // Look for #offset following the "[Rn],".
+      const AsmToken &HashTok = getLexer().getTok();
+      if (HashTok.isNot(AsmToken::Hash))
+        return Error(HashTok.getLoc(), "'#' expected");
+      getLexer().Lex(); // Eat hash token.
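+
+      // What follows the '#' is a full expression, so both "[r0], #-8" and
+      // "[r0], #(2*4)" are accepted here.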
+      if (getParser().ParseExpression(Offset))
+        return true;
+    }
+    Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+                               OffsetRegShifted, ShiftType, ShiftAmount,
+                               Preindexed, Postindexed, Negative, Writeback);
+    return false;
+  }
+
+  return true;
+}
+
+/// ParseShift parses a shift in one of these two forms:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// It returns false if it successfully parses a shift, true otherwise.
+bool ARMAsmParser::ParseShift(ShiftType *St, const MCExpr *&ShiftAmount) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+  const StringRef &ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    *St = Lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    *St = Lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    *St = Asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    *St = Ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    *St = Rrx;
+  else
+    return true;
+  getLexer().Lex(); // Eat shift type token.
+
+  // For all but rotate-right (rrx) there must be a '#' and a shift amount.
+  if (*St != Rrx) {
+    // Look for # following the shift type.
+    const AsmToken &HashTok = getLexer().getTok();
+    if (HashTok.isNot(AsmToken::Hash))
+      return Error(HashTok.getLoc(), "'#' expected");
+    getLexer().Lex(); // Eat hash token.
+
+    if (getParser().ParseExpression(ShiftAmount))
+      return true;
+  }
+
+  return false;
+}
+
+// A hack to allow some testing.
+int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
+  if (Name == "r0" || Name == "R0")
+    return 0;
+  else if (Name == "r1" || Name == "R1")
+    return 1;
+  else if (Name == "r2" || Name == "R2")
+    return 2;
+  else if (Name == "r3" || Name == "R3")
+    return 3;
+  else if (Name == "r4" || Name == "R4")
+    return 4;
+  else if (Name == "r5" || Name == "R5")
+    return 5;
+  else if (Name == "r6" || Name == "R6")
+    return 6;
+  else if (Name == "r7" || Name == "R7")
+    return 7;
+  else if (Name == "r8" || Name == "R8")
+    return 8;
+  else if (Name == "r9" || Name == "R9")
+    return 9;
+  else if (Name == "r10" || Name == "R10")
+    return 10;
+  else if (Name == "r11" || Name == "R11" || Name == "fp")
+    return 11;
+  else if (Name == "r12" || Name == "R12" || Name == "ip")
+    return 12;
+  else if (Name == "r13" || Name == "R13" || Name == "sp")
+    return 13;
+  else if (Name == "r14" || Name == "R14" || Name == "lr")
+    return 14;
+  else if (Name == "r15" || Name == "R15" || Name == "pc")
+    return 15;
+  return -1;
+}
+
+// A hack to allow some testing.
+bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
+                                    MCInst &Inst) {
+  struct ARMOperand Op0 = Operands[0];
+  assert(Op0.Kind == ARMOperand::Token && "First operand not a Token");
+  const StringRef &Mnemonic = Op0.getToken();
+  if (Mnemonic == "add" ||
+      Mnemonic == "stmfd" ||
+      Mnemonic == "str" ||
+      Mnemonic == "ldmfd" ||
+      Mnemonic == "ldr" ||
+      Mnemonic == "mov" ||
+      Mnemonic == "sub")
+    return false;
+
+  return true;
+}
+
+// TODO: this is a work in progress.
+bool ARMAsmParser::ParseOperand(ARMOperand &Op) {
+  switch (getLexer().getKind()) {
+  case AsmToken::Identifier:
+    if (!ParseRegister(Op))
+      return false;
+    // TODO: parse other operands that start with an identifier, like labels.
+    return Error(getLexer().getTok().getLoc(), "labels not yet supported");
+  case AsmToken::LBrac:
+    if (!ParseMemory(Op))
+      return false;
+    return true;
+  case AsmToken::LCurly:
+    if (!ParseRegisterList(Op))
+      return false;
+    return true;
+  case AsmToken::Hash:
// #42 -> immediate. + // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate + getLexer().Lex(); + const MCExpr *Val; + if (getParser().ParseExpression(Val)) + return true; + Op = ARMOperand::CreateImm(Val); + return false; + default: + return Error(getLexer().getTok().getLoc(), "unexpected token in operand"); + } +} + +bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) { + SmallVector<ARMOperand, 7> Operands; + + Operands.push_back(ARMOperand::CreateToken(Name)); + + SMLoc Loc = getLexer().getTok().getLoc(); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + + // Read the first operand. + Operands.push_back(ARMOperand()); + if (ParseOperand(Operands.back())) + return true; + + while (getLexer().is(AsmToken::Comma)) { + getLexer().Lex(); // Eat the comma. + + // Parse and remember the operand. + Operands.push_back(ARMOperand()); + if (ParseOperand(Operands.back())) + return true; + } + } + if (!MatchInstruction(Operands, Inst)) + return false; + + Error(Loc, "ARMAsmParser::ParseInstruction only partly implemented"); + return true; +} + +bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return ParseDirectiveWord(4, DirectiveID.getLoc()); + return true; +} + +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().ParseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + getLexer().Lex(); + } + } + + getLexer().Lex(); + return false; +} + +// Force static initialization. +extern "C" void LLVMInitializeARMAsmParser() { + RegisterAsmParser<ARMAsmParser> X(TheARMTarget); + RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget); +} diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt new file mode 100644 index 0000000..308c6cf --- /dev/null +++ b/lib/Target/ARM/AsmParser/CMakeLists.txt @@ -0,0 +1,6 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMARMAsmParser + ARMAsmParser.cpp + ) + diff --git a/lib/Target/ARM/AsmParser/Makefile b/lib/Target/ARM/AsmParser/Makefile new file mode 100644 index 0000000..97e5612 --- /dev/null +++ b/lib/Target/ARM/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMAsmParser + +# Hack: we need to include 'main' ARM target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
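+
+# The rules in Makefile.common (included below) turn LIBRARYNAME into the
+# libLLVMARMAsmParser.a library.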
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index 434a19a..546731b 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -1,5 +1,3 @@ -//===-- ARMAsmPrinter.cpp - ARM LLVM assembly writer ----------------------===// -// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source @@ -21,23 +19,30 @@ #include "ARMMachineFunctionInfo.h" #include "llvm/Constants.h" #include "llvm/Module.h" -#include "llvm/MDNode.h" +#include "llvm/Assembly/Writer.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DwarfWriter.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Mangler.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/FormattedStream.h" #include <cctype> using namespace llvm; @@ -45,7 +50,6 @@ STATISTIC(EmittedInsts, "Number of machine instrs printed"); namespace { class VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter { - DwarfWriter *DW; /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when printing asm code for different targets. @@ -68,22 +72,18 @@ namespace { /// GVNonLazyPtrs - Keeps the set of GlobalValues that require /// non-lazy-pointers for indirect access. - StringSet<> GVNonLazyPtrs; + StringMap<std::string> GVNonLazyPtrs; /// HiddenGVNonLazyPtrs - Keeps the set of GlobalValues with hidden /// visibility that require non-lazy-pointers for indirect access. - StringSet<> HiddenGVNonLazyPtrs; - - /// FnStubs - Keeps the set of external function GlobalAddresses that the - /// asm printer should generate stubs for. - StringSet<> FnStubs; + StringMap<std::string> HiddenGVNonLazyPtrs; /// True if asm printer is printing a series of CONSTPOOL_ENTRY. 
bool InCPMode; public: - explicit ARMAsmPrinter(raw_ostream &O, TargetMachine &TM, - const TargetAsmInfo *T, bool V) - : AsmPrinter(O, TM, T, V), DW(0), AFI(NULL), MCP(NULL), + explicit ARMAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM, + const MCAsmInfo *T, bool V) + : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL), InCPMode(false) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); } @@ -110,6 +110,7 @@ namespace { const char *Modifier = 0); void printBitfieldInvMaskImmOperand (const MachineInstr *MI, int OpNum); + void printThumbITMask(const MachineInstr *MI, int OpNum); void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum); void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum, unsigned Scale); @@ -118,10 +119,10 @@ namespace { void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum); void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum); - void printT2SOImmOperand(const MachineInstr *MI, int OpNum); void printT2SOOperand(const MachineInstr *MI, int OpNum); void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum); void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum); + void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum); void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum); void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum); @@ -132,6 +133,9 @@ namespace { void printCPInstOperand(const MachineInstr *MI, int OpNum, const char *Modifier); void printJTBlockOperand(const MachineInstr *MI, int OpNum); + void printJT2BlockOperand(const MachineInstr *MI, int OpNum); + void printTBAddrMode(const MachineInstr *MI, int OpNum); + void printNoHashImmediate(const MachineInstr *MI, int OpNum); virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode); @@ -139,12 +143,14 @@ namespace { unsigned AsmVariant, const char *ExtraCode); - void printModuleLevelGV(const GlobalVariable* GVar); - bool printInstruction(const MachineInstr *MI); // autogenerated. + void PrintGlobalVariable(const GlobalVariable* GVar); + void printInstruction(const MachineInstr *MI); // autogenerated. + static const char *getRegisterName(unsigned RegNo); + void printMachineInstruction(const MachineInstr *MI); bool runOnMachineFunction(MachineFunction &F); - bool doInitialization(Module &M); bool doFinalization(Module &M); + void EmitStartOfAsmFile(Module &M); /// EmitMachineConstantPoolValue - Print a machine constantpool value to /// the .s file. @@ -153,24 +159,35 @@ namespace { ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); GlobalValue *GV = ACPV->getGV(); - std::string Name = GV ? Mang->getValueName(GV) : TAI->getGlobalPrefix(); - if (!GV) - Name += ACPV->getSymbol(); - if (ACPV->isNonLazyPointer()) { - if (GV->hasHiddenVisibility()) - HiddenGVNonLazyPtrs.insert(Name); - else - GVNonLazyPtrs.insert(Name); - printSuffixedName(Name, "$non_lazy_ptr"); - } else if (ACPV->isStub()) { - FnStubs.insert(Name); - printSuffixedName(Name, "$stub"); + std::string Name; + + if (ACPV->isLSDA()) { + SmallString<16> LSDAName; + raw_svector_ostream(LSDAName) << MAI->getPrivateGlobalPrefix() << + "_LSDA_" << getFunctionNumber(); + Name = LSDAName.str(); + } else if (GV) { + bool isIndirect = Subtarget->isTargetDarwin() && + Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + if (!isIndirect) + Name = Mang->getMangledName(GV); + else { + // FIXME: Remove this when Darwin transition to @GOT like syntax. 
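+        // Indirect globals are referenced through a "$non_lazy_ptr" pointer
+        // that dyld fills in at load time; the maps populated below let
+        // doFinalization() emit one such pointer per referenced symbol.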
+ std::string SymName = Mang->getMangledName(GV); + Name = Mang->getMangledName(GV, "$non_lazy_ptr", true); + if (GV->hasHiddenVisibility()) + HiddenGVNonLazyPtrs[SymName] = Name; + else + GVNonLazyPtrs[SymName] = Name; + } } else - O << Name; + Name = Mang->makeNameProper(ACPV->getSymbol()); + O << Name; + if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")"; if (ACPV->getPCAdjustment() != 0) { - O << "-(" << TAI->getPrivateGlobalPrefix() << "PC" - << utostr(ACPV->getLabelId()) + O << "-(" << MAI->getPrivateGlobalPrefix() << "PC" + << ACPV->getLabelId() << "+" << (unsigned)ACPV->getPCAdjustment(); if (ACPV->mustAddCurrentAddress()) O << "-."; @@ -178,7 +195,7 @@ namespace { } O << "\n"; } - + void getAnalysisUsage(AnalysisUsage &AU) const { AsmPrinter::getAnalysisUsage(AU); AU.setPreservesAll(); @@ -205,38 +222,39 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // NOTE: we don't print out constant pools here, they are handled as // instructions. - O << "\n"; + O << '\n'; + // Print out labels for the function. const Function *F = MF.getFunction(); + OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM)); + switch (F->getLinkage()) { - default: assert(0 && "Unknown linkage type!"); + default: llvm_unreachable("Unknown linkage type!"); case Function::PrivateLinkage: case Function::InternalLinkage: - SwitchToTextSection("\t.text", F); break; case Function::ExternalLinkage: - SwitchToTextSection("\t.text", F); O << "\t.globl\t" << CurrentFnName << "\n"; break; + case Function::LinkerPrivateLinkage: case Function::WeakAnyLinkage: case Function::WeakODRLinkage: case Function::LinkOnceAnyLinkage: case Function::LinkOnceODRLinkage: if (Subtarget->isTargetDarwin()) { - SwitchToTextSection( - ".section __TEXT,__textcoal_nt,coalesced,pure_instructions", F); O << "\t.globl\t" << CurrentFnName << "\n"; O << "\t.weak_definition\t" << CurrentFnName << "\n"; } else { - O << TAI->getWeakRefDirective() << CurrentFnName << "\n"; + O << MAI->getWeakRefDirective() << CurrentFnName << "\n"; } break; } printVisibility(CurrentFnName, F->getVisibility()); + unsigned FnAlign = 1 << MF.getAlignment(); // MF alignment is log2. if (AFI->isThumbFunction()) { - EmitAlignment(MF.getAlignment(), F, AFI->getAlign()); + EmitAlignment(FnAlign, F, AFI->getAlign()); O << "\t.code\t16\n"; O << "\t.thumb_func"; if (Subtarget->isTargetDarwin()) @@ -244,7 +262,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { O << "\n"; InCPMode = false; } else { - EmitAlignment(MF.getAlignment(), F); + EmitAlignment(FnAlign, F); } O << CurrentFnName << ":\n"; @@ -266,8 +284,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { I != E; ++I) { // Print a label for the basic block. if (I != MF.begin()) { - printBasicBlockLabel(I, true, true, VerboseAsm); - O << '\n'; + EmitBasicBlockStart(I); } for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); II != E; ++II) { @@ -276,14 +293,12 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } } - if (TAI->hasDotTypeDotSizeDirective()) + if (MAI->hasDotTypeDotSizeDirective()) O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n"; // Emit post-function debug information. 
DW->EndFunction(&MF); - O.flush(); - return false; } @@ -298,37 +313,39 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0 unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1 O << '{' - << TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi) + << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) << '}'; + } else if (Modifier && strcmp(Modifier, "lane") == 0) { + unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); + unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1, + &ARM::DPR_VFP2RegClass); + O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']'; } else { - O << TRI->getAsmName(Reg); + O << getRegisterName(Reg); } } else - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); break; } case MachineOperand::MO_Immediate: { - if (!Modifier || strcmp(Modifier, "no_hash") != 0) - O << "#"; - - O << MO.getImm(); + int64_t Imm = MO.getImm(); + O << '#'; + if (Modifier) { + if (strcmp(Modifier, "lo16") == 0) + O << ":lower16:"; + else if (strcmp(Modifier, "hi16") == 0) + O << ":upper16:"; + } + O << Imm; break; } case MachineOperand::MO_MachineBasicBlock: - printBasicBlockLabel(MO.getMBB()); + GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI); return; case MachineOperand::MO_GlobalAddress: { bool isCallOp = Modifier && !strcmp(Modifier, "call"); GlobalValue *GV = MO.getGlobal(); - std::string Name = Mang->getValueName(GV); - bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() || - GV->hasLinkOnceLinkage()); - if (isExt && isCallOp && Subtarget->isTargetDarwin() && - TM.getRelocationModel() != Reloc::Static) { - printSuffixedName(Name, "$stub"); - FnStubs.insert(Name); - } else - O << Name; + O << Mang->getMangledName(GV); printOffset(MO.getOffset()); @@ -339,25 +356,20 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, } case MachineOperand::MO_ExternalSymbol: { bool isCallOp = Modifier && !strcmp(Modifier, "call"); - std::string Name(TAI->getGlobalPrefix()); - Name += MO.getSymbolName(); - if (isCallOp && Subtarget->isTargetDarwin() && - TM.getRelocationModel() != Reloc::Static) { - printSuffixedName(Name, "$stub"); - FnStubs.insert(Name); - } else - O << Name; + std::string Name = Mang->makeNameProper(MO.getSymbolName()); + + O << Name; if (isCallOp && Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_) O << "(PLT)"; break; } case MachineOperand::MO_ConstantPoolIndex: - O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' << MO.getIndex(); break; case MachineOperand::MO_JumpTableIndex: - O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << MO.getIndex(); break; default: @@ -365,9 +377,12 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, } } -static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, - const TargetAsmInfo *TAI) { - assert(V < (1 << 12) && "Not a valid so_imm value!"); +static void printSOImm(formatted_raw_ostream &O, int64_t V, bool VerboseAsm, + const MCAsmInfo *MAI) { + // Break it up into two parts that make up a shifter immediate. 
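+  // A shifter immediate is an 8-bit value rotated right by an even amount;
+  // e.g. 0xFF000000 is encoded as Imm=0xFF with Rot=8, since
+  // rotr32(0xFF, 8) == 0xFF000000.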
+ V = ARM_AM::getSOImmVal(V); + assert(V != -1 && "Not a valid so_imm value!"); + unsigned Imm = ARM_AM::getSOImmValImm(V); unsigned Rot = ARM_AM::getSOImmValRot(V); @@ -377,7 +392,7 @@ static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, O << "#" << Imm << ", " << Rot; // Pretty printed version. if (VerboseAsm) - O << ' ' << TAI->getCommentString() + O << ' ' << MAI->getCommentString() << ' ' << (int)ARM_AM::rotr32(Imm, Rot); } else { O << "#" << Imm; @@ -389,7 +404,7 @@ static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm, void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) { const MachineOperand &MO = MI->getOperand(OpNum); assert(MO.isImm() && "Not a valid so_imm value!"); - printSOImm(O, MO.getImm(), VerboseAsm, TAI); + printSOImm(O, MO.getImm(), VerboseAsm, MAI); } /// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov' @@ -399,15 +414,15 @@ void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) { assert(MO.isImm() && "Not a valid so_imm value!"); unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm()); unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm()); - printSOImm(O, ARM_AM::getSOImmVal(V1), VerboseAsm, TAI); + printSOImm(O, V1, VerboseAsm, MAI); O << "\n\torr"; printPredicateOperand(MI, 2); O << " "; - printOperand(MI, 0); + printOperand(MI, 0); O << ", "; - printOperand(MI, 0); + printOperand(MI, 0); O << ", "; - printSOImm(O, ARM_AM::getSOImmVal(V2), VerboseAsm, TAI); + printSOImm(O, V2, VerboseAsm, MAI); } // so_reg is a 4-operand unit corresponding to register forms of the A5.1 @@ -420,8 +435,7 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) { const MachineOperand &MO2 = MI->getOperand(Op+1); const MachineOperand &MO3 = MI->getOperand(Op+2); - assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << getRegisterName(MO1.getReg()); // Print the shift opc. O << ", " @@ -429,8 +443,7 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) { << " "; if (MO2.getReg()) { - assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg())); - O << TM.getRegisterInfo()->get(MO2.getReg()).AsmName; + O << getRegisterName(MO2.getReg()); assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); } else { O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); @@ -447,7 +460,7 @@ void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) { return; } - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); if (!MO2.getReg()) { if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. 
@@ -460,8 +473,8 @@ void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) { O << ", " << (char)ARM_AM::getAM2Op(MO3.getImm()) - << TM.getRegisterInfo()->get(MO2.getReg()).AsmName; - + << getRegisterName(MO2.getReg()); + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) O << ", " << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) @@ -483,8 +496,8 @@ void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){ } O << (char)ARM_AM::getAM2Op(MO2.getImm()) - << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; - + << getRegisterName(MO1.getReg()); + if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) O << ", " << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm())) @@ -495,18 +508,18 @@ void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) { const MachineOperand &MO1 = MI->getOperand(Op); const MachineOperand &MO2 = MI->getOperand(Op+1); const MachineOperand &MO3 = MI->getOperand(Op+2); - + assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); if (MO2.getReg()) { O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm()) - << TM.getRegisterInfo()->get(MO2.getReg()).AsmName + << getRegisterName(MO2.getReg()) << "]"; return; } - + if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) O << ", #" << (char)ARM_AM::getAM3Op(MO3.getImm()) @@ -520,7 +533,7 @@ void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){ if (MO1.getReg()) { O << (char)ARM_AM::getAM3Op(MO2.getImm()) - << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + << getRegisterName(MO1.getReg()); return; } @@ -530,7 +543,7 @@ void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){ << (char)ARM_AM::getAM3Op(MO2.getImm()) << ImmOffs; } - + void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op, const char *Modifier) { const MachineOperand &MO1 = MI->getOperand(Op); @@ -538,11 +551,18 @@ void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op, ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); if (Modifier && strcmp(Modifier, "submode") == 0) { if (MO1.getReg() == ARM::SP) { + // FIXME bool isLDM = (MI->getOpcode() == ARM::LDM || - MI->getOpcode() == ARM::LDM_RET); + MI->getOpcode() == ARM::LDM_RET || + MI->getOpcode() == ARM::t2LDM || + MI->getOpcode() == ARM::t2LDM_RET); O << ARM_AM::getAMSubModeAltStr(Mode, isLDM); } else O << ARM_AM::getAMSubModeStr(Mode); + } else if (Modifier && strcmp(Modifier, "wide") == 0) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); + if (Mode == ARM_AM::ia) + O << ".w"; } else { printOperand(MI, Op); if (ARM_AM::getAM4WBFlag(MO2.getImm())) @@ -559,7 +579,7 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op, printOperand(MI, Op); return; } - + assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); if (Modifier && strcmp(Modifier, "submode") == 0) { @@ -573,14 +593,14 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op, return; } else if (Modifier && strcmp(Modifier, "base") == 0) { // Used for FSTM{D|S} and LSTM{D|S} operations. 
- O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << getRegisterName(MO1.getReg()); if (ARM_AM::getAM5WBFlag(MO2.getImm())) O << "!"; return; } - - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; - + + O << "[" << getRegisterName(MO1.getReg()); + if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { O << ", #" << (char)ARM_AM::getAM5Op(MO2.getImm()) @@ -595,13 +615,13 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op) { const MachineOperand &MO3 = MI->getOperand(Op+2); // FIXME: No support yet for specifying alignment. - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]"; + O << "[" << getRegisterName(MO1.getReg()) << "]"; if (ARM_AM::getAM6WBFlag(MO3.getImm())) { if (MO2.getReg() == 0) O << "!"; else - O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName; + O << ", " << getRegisterName(MO2.getReg()); } } @@ -614,7 +634,7 @@ void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op, const MachineOperand &MO1 = MI->getOperand(Op); assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - O << "[pc, +" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]"; + O << "[pc, +" << getRegisterName(MO1.getReg()) << "]"; } void @@ -630,11 +650,26 @@ ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op) { //===--------------------------------------------------------------------===// void +ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned Mask = MI->getOperand(Op).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = (Mask & (1 << Pos)) == 0; + if (T) + O << 't'; + else + O << 'e'; + } +} + +void ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) { const MachineOperand &MO1 = MI->getOperand(Op); const MachineOperand &MO2 = MI->getOperand(Op+1); - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; - O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName << "]"; + O << "[" << getRegisterName(MO1.getReg()); + O << ", " << getRegisterName(MO2.getReg()) << "]"; } void @@ -649,9 +684,9 @@ ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op, return; } - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); if (MO3.getReg()) - O << ", " << TM.getRegisterInfo()->get(MO3.getReg()).AsmName; + O << ", " << getRegisterName(MO3.getReg()); else if (unsigned ImmOffs = MO2.getImm()) { O << ", #" << ImmOffs; if (Scale > 1) @@ -676,7 +711,7 @@ ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) { void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) { const MachineOperand &MO1 = MI->getOperand(Op); const MachineOperand &MO2 = MI->getOperand(Op+1); - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); if (unsigned ImmOffs = MO2.getImm()) O << ", #" << ImmOffs << " * 4"; O << "]"; @@ -684,20 +719,6 @@ void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) { //===--------------------------------------------------------------------===// -/// printT2SOImmOperand - T2SOImm is: -/// 1. a 4-bit splat control value and 8 bit immediate value -/// 2. 
a 5-bit rotate amount and a non-zero 8-bit immediate value -/// represented by a normalizedin 7-bit value (msb is always 1) -void ARMAsmPrinter::printT2SOImmOperand(const MachineInstr *MI, int OpNum) { - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isImm() && "Not a valid so_imm value!"); - - unsigned Imm = ARM_AM::getT2SOImmValDecode(MO.getImm()); - // Always print the immediate directly, as the "rotate" form - // is deprecated in some contexts. - O << "#" << Imm; -} - // Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 // register with shift forms. // REG 0 0 - e.g. R5 @@ -708,7 +729,7 @@ void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum) { unsigned Reg = MO1.getReg(); assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - O << TM.getRegisterInfo()->getAsmName(Reg); + O << getRegisterName(Reg); // Print the shift opc. O << ", " @@ -724,7 +745,7 @@ void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI, const MachineOperand &MO1 = MI->getOperand(OpNum); const MachineOperand &MO2 = MI->getOperand(OpNum+1); - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); unsigned OffImm = MO2.getImm(); if (OffImm) // Don't print +0. @@ -737,7 +758,7 @@ void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI, const MachineOperand &MO1 = MI->getOperand(OpNum); const MachineOperand &MO2 = MI->getOperand(OpNum+1); - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); int32_t OffImm = (int32_t)MO2.getImm(); // Don't print +0. @@ -748,6 +769,22 @@ void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI, O << "]"; } +void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI, + int OpNum) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm() / 4; + // Don't print +0. 
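+  // (imm8s4 offsets must be a multiple of 4: the byte offset is divided by
+  // 4 above and printed back with a trailing " * 4".)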
+ if (OffImm < 0) + O << ", #-" << -OffImm << " * 4"; + else if (OffImm > 0) + O << ", #+" << OffImm << " * 4"; + O << "]"; +} + void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum) { const MachineOperand &MO1 = MI->getOperand(OpNum); @@ -765,17 +802,15 @@ void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI, const MachineOperand &MO2 = MI->getOperand(OpNum+1); const MachineOperand &MO3 = MI->getOperand(OpNum+2); - O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName; + O << "[" << getRegisterName(MO1.getReg()); - if (MO2.getReg()) { - O << ", +" - << TM.getRegisterInfo()->get(MO2.getReg()).AsmName; + assert(MO2.getReg() && "Invalid so_reg load / store address!"); + O << ", " << getRegisterName(MO2.getReg()); - unsigned ShAmt = MO3.getImm(); - if (ShAmt) { - assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); - O << ", lsl #" << ShAmt; - } + unsigned ShAmt = MO3.getImm(); + if (ShAmt) { + assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); + O << ", lsl #" << ShAmt; } O << "]"; } @@ -799,14 +834,17 @@ void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum){ void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum) { int Id = (int)MI->getOperand(OpNum).getImm(); - O << TAI->getPrivateGlobalPrefix() << "PC" << Id; + O << MAI->getPrivateGlobalPrefix() << "PC" << Id; } void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum) { O << "{"; - for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { + // Always skip the first operand, it's the optional (and implicit writeback). + for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isImplicit()) + continue; + if ((int)i != OpNum+1) O << ", "; printOperand(MI, i); - if (i != e-1) O << ", "; } O << "}"; } @@ -818,14 +856,14 @@ void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum, // data itself. 
if (!strcmp(Modifier, "label")) { unsigned ID = MI->getOperand(OpNum).getImm(); - O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' << ID << ":\n"; } else { assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE"); unsigned CPI = MI->getOperand(OpNum).getIndex(); const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI]; - + if (MCPE.isMachineConstantPoolEntry()) { EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); } else { @@ -835,57 +873,119 @@ void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum, } void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) { + assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!"); + const MachineOperand &MO1 = MI->getOperand(OpNum); const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id unsigned JTI = MO1.getIndex(); - O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm() << ":\n"; - const char *JTEntryDirective = TAI->getJumpTableDirective(); - if (!JTEntryDirective) - JTEntryDirective = TAI->getData32bitsDirective(); + const char *JTEntryDirective = MAI->getData32bitsDirective(); const MachineFunction *MF = MI->getParent()->getParent(); const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; - bool UseSet= TAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_; - std::set<MachineBasicBlock*> JTSets; + bool UseSet= MAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_; + SmallPtrSet<MachineBasicBlock*, 8> JTSets; for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { MachineBasicBlock *MBB = JTBBs[i]; - if (UseSet && JTSets.insert(MBB).second) + bool isNew = JTSets.insert(MBB); + + if (UseSet && isNew) printPICJumpTableSetLabel(JTI, MO2.getImm(), MBB); O << JTEntryDirective << ' '; if (UseSet) - O << TAI->getPrivateGlobalPrefix() << getFunctionNumber() + O << MAI->getPrivateGlobalPrefix() << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm() << "_set_" << MBB->getNumber(); else if (TM.getRelocationModel() == Reloc::PIC_) { - printBasicBlockLabel(MBB, false, false, false); - // If the arch uses custom Jump Table directives, don't calc relative to JT - if (!TAI->getJumpTableDirective()) - O << '-' << TAI->getPrivateGlobalPrefix() << "JTI" - << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm(); - } else - printBasicBlockLabel(MBB, false, false, false); + GetMBBSymbol(MBB->getNumber())->print(O, MAI); + O << '-' << MAI->getPrivateGlobalPrefix() << "JTI" + << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm(); + } else { + GetMBBSymbol(MBB->getNumber())->print(O, MAI); + } + if (i != e-1) + O << '\n'; + } +} + +void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum) { + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + unsigned JTI = MO1.getIndex(); + O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << JTI << '_' << MO2.getImm() << ":\n"; + + const MachineFunction *MF = MI->getParent()->getParent(); + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs 
= JT[JTI].MBBs; + bool ByteOffset = false, HalfWordOffset = false; + if (MI->getOpcode() == ARM::t2TBB) + ByteOffset = true; + else if (MI->getOpcode() == ARM::t2TBH) + HalfWordOffset = true; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + if (ByteOffset) + O << MAI->getData8bitsDirective(); + else if (HalfWordOffset) + O << MAI->getData16bitsDirective(); + if (ByteOffset || HalfWordOffset) { + O << '('; + GetMBBSymbol(MBB->getNumber())->print(O, MAI); + O << "-" << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << JTI << '_' << MO2.getImm() << ")/2"; + } else { + O << "\tb.w "; + GetMBBSymbol(MBB->getNumber())->print(O, MAI); + } if (i != e-1) O << '\n'; } + + // Make sure the instruction that follows TBB is 2-byte aligned. + // FIXME: Constant island pass should insert an "ALIGN" instruction instead. + if (ByteOffset && (JTBBs.size() & 1)) { + O << '\n'; + EmitAlignment(1); + } +} + +void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum) { + O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MI->getOpcode() == ARM::t2TBH) + O << ", lsl #1"; + O << ']'; } +void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum) { + O << MI->getOperand(OpNum).getImm(); +} bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode){ // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. - + switch (ExtraCode[0]) { default: return true; // Unknown modifier. - case 'a': // Don't print "#" before a global var name or constant. - case 'c': // Don't print "$" before a global var name or constant. - printOperand(MI, OpNum, "no_hash"); + case 'a': // Print as a memory address. + if (MI->getOperand(OpNum).isReg()) { + O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]"; + return false; + } + // Fallthrough + case 'c': // Don't print "#" before an immediate operand. + if (!MI->getOperand(OpNum).isImm()) + return true; + printNoHashImmediate(MI, OpNum); return false; case 'P': // Print a VFP double precision register. printOperand(MI, OpNum); @@ -898,7 +998,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (TM.getTargetData()->isBigEndian()) break; // Fallthrough - case 'H': // Write second word of DI / DF reference. + case 'H': // Write second word of DI / DF reference. // Verify that this operand has two consecutive registers. if (!MI->getOperand(OpNum).isReg() || OpNum+1 == MI->getNumOperands() || @@ -907,7 +1007,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, ++OpNum; // Return the high-part. } } - + printOperand(MI, OpNum); return false; } @@ -917,7 +1017,10 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, const char *ExtraCode) { if (ExtraCode && ExtraCode[0]) return true; // Unknown modifier. - printAddrMode2Operand(MI, OpNum); + + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << getRegisterName(MO.getReg()) << "]"; return false; } @@ -938,16 +1041,47 @@ void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) { }} // Call the autogenerated instruction printer routines. 
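+  // processDebugLoc() is called before and after printInstruction() so that
+  // debug location directives bracket the printed instruction.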
+  processDebugLoc(MI, true);
   printInstruction(MI);
+  if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+    EmitComments(*MI);
+  O << '\n';
+  processDebugLoc(MI, false);
 }
 
-bool ARMAsmPrinter::doInitialization(Module &M) {
-
-  bool Result = AsmPrinter::doInitialization(M);
-  DW = getAnalysisIfAvailable<DwarfWriter>();
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget->isTargetDarwin()) {
+    Reloc::Model RelocM = TM.getRelocationModel();
+    if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
+      // Declare all the text sections up front (before the DWARF sections
+      // emitted by AsmPrinter::doInitialization) so the assembler will keep
+      // them together at the beginning of the object file.  This helps
+      // avoid out-of-range branches that are due to a fundamental limitation
+      // of the way symbol offsets are encoded with the current Darwin ARM
+      // relocations.
+      TargetLoweringObjectFileMachO &TLOFMacho =
+        static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+      OutStreamer.SwitchSection(TLOFMacho.getTextSection());
+      OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
+      OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection());
+      if (RelocM == Reloc::DynamicNoPIC) {
+        const MCSection *sect =
+          TLOFMacho.getMachOSection("__TEXT", "__symbol_stub4",
+                                    MCSectionMachO::S_SYMBOL_STUBS,
+                                    12, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      } else {
+        const MCSection *sect =
+          TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub4",
+                                    MCSectionMachO::S_SYMBOL_STUBS,
+                                    16, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      }
+    }
+  }
 
-  // Thumb-2 instructions are supported only in unified assembler syntax mode.
-  if (Subtarget->hasThumb2())
+  // Use unified assembler syntax mode for Thumb.
+  if (Subtarget->isThumb())
     O << "\t.syntax unified\n";
 
   // Emit ARM Build Attributes
@@ -975,22 +1109,16 @@ bool ARMAsmPrinter::doInitialization(Module &M) {
     O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_needed << ", 1\n"
      << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_preserved << ", 1\n";
 
+    // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
+    if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard)
+      O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_HardFP_use << ", 3\n"
+        << "\t.eabi_attribute " << ARMBuildAttrs::ABI_VFP_args << ", 1\n";
+
     // FIXME: Should we signal R9 usage?
   }
-
-  return Result;
-}
-
-/// PrintUnmangledNameSafely - Print out the printable characters in the name.
-/// Don't print things like \\n or \\0.
-static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) { - for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen(); - Name != E; ++Name) - if (isprint(*Name)) - OS << *Name; } -void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) { +void ARMAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) { const TargetData *TD = TM.getTargetData(); if (!GVar->hasInitializer()) // External global require no code @@ -1009,10 +1137,8 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) { return; } - std::string name = Mang->getValueName(GVar); + std::string name = Mang->getMangledName(GVar); Constant *C = GVar->getInitializer(); - if (isa<MDNode>(C) || isa<MDString>(C)) - return; const Type *Type = C->getType(); unsigned Size = TD->getTypeAllocSize(Type); unsigned Align = TD->getPreferredAlignmentLog(GVar); @@ -1023,14 +1149,16 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) { if (Subtarget->isTargetELF()) O << "\t.type " << name << ",%object\n"; - if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() && - !(isDarwin && - TAI->SectionKindForGlobal(GVar) == SectionKind::RODataMergeStr)) { - // FIXME: This seems to be pretty darwin-specific + const MCSection *TheSection = + getObjFileLowering().SectionForGlobal(GVar, Mang, TM); + OutStreamer.SwitchSection(TheSection); + // FIXME: get this stuff from section kind flags. + if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() && + // Don't put things that should go in the cstring section into "comm". + !TheSection->getKind().isMergeableCString()) { if (GVar->hasExternalLinkage()) { - SwitchToSection(TAI->SectionForGlobal(GVar)); - if (const char *Directive = TAI->getZeroFillDirective()) { + if (const char *Directive = MAI->getZeroFillDirective()) { O << "\t.globl\t" << name << "\n"; O << Directive << "__DATA, __common, " << name << ", " << Size << ", " << Align << "\n"; @@ -1043,57 +1171,56 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) { if (isDarwin) { if (GVar->hasLocalLinkage()) { - O << TAI->getLCOMMDirective() << name << "," << Size + O << MAI->getLCOMMDirective() << name << "," << Size << ',' << Align; } else if (GVar->hasCommonLinkage()) { - O << TAI->getCOMMDirective() << name << "," << Size + O << MAI->getCOMMDirective() << name << "," << Size << ',' << Align; } else { - SwitchToSection(TAI->SectionForGlobal(GVar)); + OutStreamer.SwitchSection(TheSection); O << "\t.globl " << name << '\n' - << TAI->getWeakDefDirective() << name << '\n'; + << MAI->getWeakDefDirective() << name << '\n'; EmitAlignment(Align, GVar); O << name << ":"; if (VerboseAsm) { - O << "\t\t\t\t" << TAI->getCommentString() << ' '; - PrintUnmangledNameSafely(GVar, O); + O << "\t\t\t\t" << MAI->getCommentString() << ' '; + WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent()); } O << '\n'; EmitGlobalConstant(C); return; } - } else if (TAI->getLCOMMDirective() != NULL) { + } else if (MAI->getLCOMMDirective() != NULL) { if (GVar->hasLocalLinkage()) { - O << TAI->getLCOMMDirective() << name << "," << Size; + O << MAI->getLCOMMDirective() << name << "," << Size; } else { - O << TAI->getCOMMDirective() << name << "," << Size; - if (TAI->getCOMMDirectiveTakesAlignment()) - O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align); + O << MAI->getCOMMDirective() << name << "," << Size; + if (MAI->getCOMMDirectiveTakesAlignment()) + O << ',' << (MAI->getAlignmentIsInBytes() ? 
(1 << Align) : Align); } } else { - SwitchToSection(TAI->SectionForGlobal(GVar)); if (GVar->hasLocalLinkage()) O << "\t.local\t" << name << "\n"; - O << TAI->getCOMMDirective() << name << "," << Size; - if (TAI->getCOMMDirectiveTakesAlignment()) - O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align); + O << MAI->getCOMMDirective() << name << "," << Size; + if (MAI->getCOMMDirectiveTakesAlignment()) + O << "," << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align); } if (VerboseAsm) { - O << "\t\t" << TAI->getCommentString() << " "; - PrintUnmangledNameSafely(GVar, O); + O << "\t\t" << MAI->getCommentString() << " "; + WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent()); } O << "\n"; return; } } - SwitchToSection(TAI->SectionForGlobal(GVar)); switch (GVar->getLinkage()) { - case GlobalValue::CommonLinkage: - case GlobalValue::LinkOnceAnyLinkage: - case GlobalValue::LinkOnceODRLinkage: - case GlobalValue::WeakAnyLinkage: - case GlobalValue::WeakODRLinkage: + case GlobalValue::CommonLinkage: + case GlobalValue::LinkOnceAnyLinkage: + case GlobalValue::LinkOnceODRLinkage: + case GlobalValue::WeakAnyLinkage: + case GlobalValue::WeakODRLinkage: + case GlobalValue::LinkerPrivateLinkage: if (isDarwin) { O << "\t.globl " << name << "\n" << "\t.weak_definition " << name << "\n"; @@ -1101,28 +1228,27 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) { O << "\t.weak " << name << "\n"; } break; - case GlobalValue::AppendingLinkage: - // FIXME: appending linkage variables should go into a section of - // their name or something. For now, just emit them as external. - case GlobalValue::ExternalLinkage: + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. + case GlobalValue::ExternalLinkage: O << "\t.globl " << name << "\n"; - // FALL THROUGH - case GlobalValue::PrivateLinkage: - case GlobalValue::InternalLinkage: break; - default: - assert(0 && "Unknown linkage type!"); + case GlobalValue::PrivateLinkage: + case GlobalValue::InternalLinkage: break; + default: + llvm_unreachable("Unknown linkage type!"); } EmitAlignment(Align, GVar); O << name << ":"; if (VerboseAsm) { - O << "\t\t\t\t" << TAI->getCommentString() << " "; - PrintUnmangledNameSafely(GVar, O); + O << "\t\t\t\t" << MAI->getCommentString() << " "; + WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent()); } O << "\n"; - if (TAI->hasDotTypeDotSizeDirective()) + if (MAI->hasDotTypeDotSizeDirective()) O << "\t.size " << name << ", " << Size << "\n"; EmitGlobalConstant(C); @@ -1131,83 +1257,36 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) { bool ARMAsmPrinter::doFinalization(Module &M) { - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - printModuleLevelGV(I); - if (Subtarget->isTargetDarwin()) { - SwitchToDataSection(""); - - // Output stubs for dynamically-linked functions - for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end(); - i != e; ++i) { - if (TM.getRelocationModel() == Reloc::PIC_) - SwitchToTextSection(".section __TEXT,__picsymbolstub4,symbol_stubs," - "none,16", 0); - else - SwitchToTextSection(".section __TEXT,__symbol_stub4,symbol_stubs," - "none,12", 0); + // All darwin targets use mach-o. 
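+  // That is what makes the static_cast to the Mach-O variant of the
+  // lowering object below safe.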
+ TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering()); - EmitAlignment(2); - O << "\t.code\t32\n"; - - const char *p = i->getKeyData(); - printSuffixedName(p, "$stub"); - O << ":\n"; - O << "\t.indirect_symbol " << p << "\n"; - O << "\tldr ip, "; - printSuffixedName(p, "$slp"); - O << "\n"; - if (TM.getRelocationModel() == Reloc::PIC_) { - printSuffixedName(p, "$scv"); - O << ":\n"; - O << "\tadd ip, pc, ip\n"; - } - O << "\tldr pc, [ip, #0]\n"; - printSuffixedName(p, "$slp"); - O << ":\n"; - O << "\t.long\t"; - printSuffixedName(p, "$lazy_ptr"); - if (TM.getRelocationModel() == Reloc::PIC_) { - O << "-("; - printSuffixedName(p, "$scv"); - O << "+8)\n"; - } else - O << "\n"; - SwitchToDataSection(".lazy_symbol_pointer", 0); - printSuffixedName(p, "$lazy_ptr"); - O << ":\n"; - O << "\t.indirect_symbol " << p << "\n"; - O << "\t.long\tdyld_stub_binding_helper\n"; - } - O << "\n"; + O << '\n'; // Output non-lazy-pointers for external and common global variables. if (!GVNonLazyPtrs.empty()) { - SwitchToDataSection("\t.non_lazy_symbol_pointer", 0); - for (StringSet<>::iterator i = GVNonLazyPtrs.begin(), - e = GVNonLazyPtrs.end(); i != e; ++i) { - const char *p = i->getKeyData(); - printSuffixedName(p, "$non_lazy_ptr"); - O << ":\n"; - O << "\t.indirect_symbol " << p << "\n"; + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(2); + for (StringMap<std::string>::iterator I = GVNonLazyPtrs.begin(), + E = GVNonLazyPtrs.end(); I != E; ++I) { + O << I->second << ":\n"; + O << "\t.indirect_symbol " << I->getKeyData() << "\n"; O << "\t.long\t0\n"; } } if (!HiddenGVNonLazyPtrs.empty()) { - SwitchToSection(TAI->getDataSection()); - for (StringSet<>::iterator i = HiddenGVNonLazyPtrs.begin(), - e = HiddenGVNonLazyPtrs.end(); i != e; ++i) { - const char *p = i->getKeyData(); - EmitAlignment(2); - printSuffixedName(p, "$non_lazy_ptr"); - O << ":\n"; - O << "\t.long " << p << "\n"; + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(2); + for (StringMap<std::string>::iterator I = HiddenGVNonLazyPtrs.begin(), + E = HiddenGVNonLazyPtrs.end(); I != E; ++I) { + O << I->second << ":\n"; + O << "\t.long " << I->getKeyData() << "\n"; } } - // Funny Darwin hack: This flag tells the linker that no global symbols // contain code that falls through to other global symbols (e.g. the obvious // implementation of multiple entry points). If this doesn't occur, the @@ -1219,24 +1298,8 @@ bool ARMAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } -/// createARMCodePrinterPass - Returns a pass that prints the ARM -/// assembly code for a MachineFunction to the given output stream, -/// using the given target machine description. This should work -/// regardless of whether the function is in SSA form. -/// -FunctionPass *llvm::createARMCodePrinterPass(raw_ostream &o, - ARMBaseTargetMachine &tm, - bool verbose) { - return new ARMAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose); -} - -namespace { - static struct Register { - Register() { - ARMBaseTargetMachine::registerAsmPrinter(createARMCodePrinterPass); - } - } Registrator; -} - // Force static initialization. 
-extern "C" void LLVMInitializeARMAsmPrinter() { } +extern "C" void LLVMInitializeARMAsmPrinter() { + RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget); + RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget); +} diff --git a/lib/Target/ARM/AsmPrinter/Makefile b/lib/Target/ARM/AsmPrinter/Makefile index ce36cec..208becc 100644 --- a/lib/Target/ARM/AsmPrinter/Makefile +++ b/lib/Target/ARM/AsmPrinter/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===## +##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 9c46fe0..6e09eb2 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -12,6 +12,8 @@ tablegen(ARMGenCallingConv.inc -gen-callingconv) tablegen(ARMGenSubtarget.inc -gen-subtarget) add_llvm_target(ARMCodeGen + ARMBaseInstrInfo.cpp + ARMBaseRegisterInfo.cpp ARMCodeEmitter.cpp ARMConstantIslandPass.cpp ARMConstantPoolValue.cpp @@ -20,14 +22,17 @@ add_llvm_target(ARMCodeGen ARMISelLowering.cpp ARMJITInfo.cpp ARMLoadStoreOptimizer.cpp + ARMMCAsmInfo.cpp ARMRegisterInfo.cpp ARMSubtarget.cpp - ARMTargetAsmInfo.cpp ARMTargetMachine.cpp + NEONPreAllocPass.cpp Thumb1InstrInfo.cpp Thumb1RegisterInfo.cpp + Thumb2ITBlockPass.cpp Thumb2InstrInfo.cpp Thumb2RegisterInfo.cpp + Thumb2SizeReduction.cpp ) target_link_libraries (LLVMARMCodeGen LLVMSelectionDAG) diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile index 9a3b9be..a8dd38c 100644 --- a/lib/Target/ARM/Makefile +++ b/lib/Target/ARM/Makefile @@ -18,6 +18,6 @@ BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \ ARMGenDAGISel.inc ARMGenSubtarget.inc \ ARMGenCodeEmitter.inc ARMGenCallingConv.inc -DIRS = AsmPrinter +DIRS = AsmPrinter AsmParser TargetInfo include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp new file mode 100644 index 0000000..821b872 --- /dev/null +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -0,0 +1,394 @@ +//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "neon-prealloc" +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +namespace { + class VISIBILITY_HIDDEN NEONPreAllocPass : public MachineFunctionPass { + const TargetInstrInfo *TII; + + public: + static char ID; + NEONPreAllocPass() : MachineFunctionPass(&ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "NEON register pre-allocation pass"; + } + + private: + bool PreAllocNEONRegisters(MachineBasicBlock &MBB); + }; + + char NEONPreAllocPass::ID = 0; +} + +static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, + unsigned &Offset, unsigned &Stride) { + // Default to unit stride with no offset. 
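// As an example of the contract filled in below: VLD3q8a reports
// FirstOpnd=0, NumRegs=3, Offset=0, Stride=2, i.e. operands 0..2 must land
// in every other D register (D0, D2, D4 once the fixed block is chosen by
// the caller).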
+ Stride = 1; + Offset = 0; + + switch (Opcode) { + default: + break; + + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2d64: + case ARM::VLD2LNd8: + case ARM::VLD2LNd16: + case ARM::VLD2LNd32: + FirstOpnd = 0; + NumRegs = 2; + return true; + + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + FirstOpnd = 0; + NumRegs = 4; + return true; + + case ARM::VLD2LNq16a: + case ARM::VLD2LNq32a: + FirstOpnd = 0; + NumRegs = 2; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD2LNq16b: + case ARM::VLD2LNq32b: + FirstOpnd = 0; + NumRegs = 2; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD3d64: + case ARM::VLD3LNd8: + case ARM::VLD3LNd16: + case ARM::VLD3LNd32: + FirstOpnd = 0; + NumRegs = 3; + return true; + + case ARM::VLD3q8a: + case ARM::VLD3q16a: + case ARM::VLD3q32a: + FirstOpnd = 0; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD3q8b: + case ARM::VLD3q16b: + case ARM::VLD3q32b: + FirstOpnd = 0; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD3LNq16a: + case ARM::VLD3LNq32a: + FirstOpnd = 0; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD3LNq16b: + case ARM::VLD3LNq32b: + FirstOpnd = 0; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d64: + case ARM::VLD4LNd8: + case ARM::VLD4LNd16: + case ARM::VLD4LNd32: + FirstOpnd = 0; + NumRegs = 4; + return true; + + case ARM::VLD4q8a: + case ARM::VLD4q16a: + case ARM::VLD4q32a: + FirstOpnd = 0; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD4q8b: + case ARM::VLD4q16b: + case ARM::VLD4q32b: + FirstOpnd = 0; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VLD4LNq16a: + case ARM::VLD4LNq32a: + FirstOpnd = 0; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VLD4LNq16b: + case ARM::VLD4LNq32b: + FirstOpnd = 0; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST2d8: + case ARM::VST2d16: + case ARM::VST2d32: + case ARM::VST2d64: + case ARM::VST2LNd8: + case ARM::VST2LNd16: + case ARM::VST2LNd32: + FirstOpnd = 3; + NumRegs = 2; + return true; + + case ARM::VST2q8: + case ARM::VST2q16: + case ARM::VST2q32: + FirstOpnd = 3; + NumRegs = 4; + return true; + + case ARM::VST2LNq16a: + case ARM::VST2LNq32a: + FirstOpnd = 3; + NumRegs = 2; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST2LNq16b: + case ARM::VST2LNq32b: + FirstOpnd = 3; + NumRegs = 2; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST3d64: + case ARM::VST3LNd8: + case ARM::VST3LNd16: + case ARM::VST3LNd32: + FirstOpnd = 3; + NumRegs = 3; + return true; + + case ARM::VST3q8a: + case ARM::VST3q16a: + case ARM::VST3q32a: + FirstOpnd = 4; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST3q8b: + case ARM::VST3q16b: + case ARM::VST3q32b: + FirstOpnd = 4; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST3LNq16a: + case ARM::VST3LNq32a: + FirstOpnd = 3; + NumRegs = 3; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST3LNq16b: + case ARM::VST3LNq32b: + FirstOpnd = 3; + NumRegs = 3; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d64: + case ARM::VST4LNd8: + case ARM::VST4LNd16: + case ARM::VST4LNd32: + 
FirstOpnd = 3; + NumRegs = 4; + return true; + + case ARM::VST4q8a: + case ARM::VST4q16a: + case ARM::VST4q32a: + FirstOpnd = 4; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST4q8b: + case ARM::VST4q16b: + case ARM::VST4q32b: + FirstOpnd = 4; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VST4LNq16a: + case ARM::VST4LNq32a: + FirstOpnd = 3; + NumRegs = 4; + Offset = 0; + Stride = 2; + return true; + + case ARM::VST4LNq16b: + case ARM::VST4LNq32b: + FirstOpnd = 3; + NumRegs = 4; + Offset = 1; + Stride = 2; + return true; + + case ARM::VTBL2: + FirstOpnd = 1; + NumRegs = 2; + return true; + + case ARM::VTBL3: + FirstOpnd = 1; + NumRegs = 3; + return true; + + case ARM::VTBL4: + FirstOpnd = 1; + NumRegs = 4; + return true; + + case ARM::VTBX2: + FirstOpnd = 2; + NumRegs = 2; + return true; + + case ARM::VTBX3: + FirstOpnd = 2; + NumRegs = 3; + return true; + + case ARM::VTBX4: + FirstOpnd = 2; + NumRegs = 4; + return true; + } + + return false; +} + +bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + for (; MBBI != E; ++MBBI) { + MachineInstr *MI = &*MBBI; + unsigned FirstOpnd, NumRegs, Offset, Stride; + if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride)) + continue; + + MachineBasicBlock::iterator NextI = next(MBBI); + for (unsigned R = 0; R < NumRegs; ++R) { + MachineOperand &MO = MI->getOperand(FirstOpnd + R); + assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); + unsigned VirtReg = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "expected a virtual register"); + + // For now, just assign a fixed set of adjacent registers. + // This leaves plenty of room for future improvements. + static const unsigned NEONDRegs[] = { + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 + }; + MO.setReg(NEONDRegs[Offset + R * Stride]); + + if (MO.isUse()) { + // Insert a copy from VirtReg. + TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg, + ARM::DPRRegisterClass, ARM::DPRRegisterClass); + if (MO.isKill()) { + MachineInstr *CopyMI = prior(MBBI); + CopyMI->findRegisterUseOperand(VirtReg)->setIsKill(); + } + MO.setIsKill(); + } else if (MO.isDef() && !MO.isDead()) { + // Add a copy to VirtReg. + TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(), + ARM::DPRRegisterClass, ARM::DPRRegisterClass); + } + } + } + + return Modified; +} + +bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= PreAllocNEONRegisters(MBB); + } + + return Modified; +} + +/// createNEONPreAllocPass - returns an instance of the NEON register +/// pre-allocation pass. +FunctionPass *llvm::createNEONPreAllocPass() { + return new NEONPreAllocPass(); +} diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt index 4d3200b..a961a57 100644 --- a/lib/Target/ARM/README-Thumb.txt +++ b/lib/Target/ARM/README-Thumb.txt @@ -226,3 +226,31 @@ etc. Almost all Thumb instructions clobber condition code. //===---------------------------------------------------------------------===// Add ldmia, stmia support. + +//===---------------------------------------------------------------------===// + +Thumb load / store address mode offsets are scaled. 
The values kept in the
+instruction operands are pre-scale values. This probably ought to be changed
+to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
+
+//===---------------------------------------------------------------------===//
+
+We need to make (some of the) Thumb1 instructions predicable. That will allow
+shrinking of predicated Thumb2 instructions. To allow this, we need to be able
+to toggle the 's' bit, since predicated instructions do not set CPSR when they
+are inside IT blocks.
+
+//===---------------------------------------------------------------------===//
+
+Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
+
+//===---------------------------------------------------------------------===//
+
+Thumb1 immediate fields sometimes keep pre-scaled values. See
+Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
+Thumb2.
+
+//===---------------------------------------------------------------------===//
+
+Rather than having tBR_JTr print a ".align 2" and the constant island pass pad
+it, add a target-specific ALIGN instruction instead. That way,
+GetInstSizeInBytes won't have to over-estimate. It can also be used for a
+loop alignment pass.
diff --git a/lib/Target/ARM/README-Thumb2.txt b/lib/Target/ARM/README-Thumb2.txt
new file mode 100644
index 0000000..e7c2552
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb2.txt
@@ -0,0 +1,6 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb2 specific).
+//===---------------------------------------------------------------------===//
+
+Make sure jumptable destinations are below the jumptable in order to make use
+of tbb / tbh.
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index f3377f9..8fb1da3 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -537,3 +537,66 @@
 Split out LDR (literal) from normal ARM LDR instruction. Also consider
 splitting LDR into imm12 and so_reg forms. This allows us to clean up some
 code. e.g. ARMLoadStoreOptimizer does not need to look at LDR (literal) and
 LDR (so_reg) while ARMConstantIslandPass only needs to worry about LDR
 (literal).
+
+//===---------------------------------------------------------------------===//
+
+We need to fix constant isel for ARMv6t2 to use MOVT.
+
+//===---------------------------------------------------------------------===//
+
+The constant island pass should make use of the full range of SoImm values for
+LEApcrel. Be careful, though, as the last attempt caused infinite looping on
+lencod.
+
+//===---------------------------------------------------------------------===//
+
+Predication issue. This function:
+
+extern unsigned array[ 128 ];
+int foo( int x ) {
+  int y;
+  y = array[ x & 127 ];
+  if ( x & 128 )
+    y = 123456789 & ( y >> 2 );
+  else
+    y = 123456789 & y;
+  return y;
+}
+
+compiles to:
+
+_foo:
+    and r1, r0, #127
+    ldr r2, LCPI1_0
+    ldr r2, [r2]
+    ldr r1, [r2, +r1, lsl #2]
+    mov r2, r1, lsr #2
+    tst r0, #128
+    moveq r2, r1
+    ldr r0, LCPI1_1
+    and r0, r2, r0
+    bx lr
+
+It would be better to do something like this, to fold the shift into the
+conditional move:
+
+    and r1, r0, #127
+    ldr r2, LCPI1_0
+    ldr r2, [r2]
+    ldr r1, [r2, +r1, lsl #2]
+    tst r0, #128
+    movne r1, r1, lsr #2
+    ldr r0, LCPI1_1
+    and r0, r1, r0
+    bx lr
+
+It saves an instruction and a register.
+
+//===---------------------------------------------------------------------===//
+
+add/sub/and/or + i32 imm can be simplified by folding part of the immediate
+into the operation.
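As a rough standalone sketch of that idea (a hypothetical helper in plain C++,
not code from the tree): peel off the lowest 8-bit chunk of the immediate at an
even bit offset, so that chunk is so_imm-encodable and folds into the ALU
operation, leaving a smaller remainder to materialize separately:

  #include <cstdint>

  // Returns the part of Imm still needing its own materialization; Chunk
  // receives the low so_imm-encodable piece (8 bits at an even offset).
  static uint32_t peelSOImmChunk(uint32_t Imm, uint32_t &Chunk) {
    if (Imm == 0) { Chunk = 0; return 0; }
    unsigned TZ = 0;
    while (!(Imm & (1u << TZ)))
      ++TZ;                      // find the lowest set bit
    TZ &= ~1u;                   // so_imm rotations are even
    Chunk = Imm & (0xFFu << TZ); // 8-bit window starting at an even bit
    return Imm & ~Chunk;
  }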
+ +//===---------------------------------------------------------------------===// + +It might be profitable to cse MOVi16 if there are lots of 32-bit immediates +with the same bottom half. diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp new file mode 100644 index 0000000..163a0a9 --- /dev/null +++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "llvm/Module.h" +#include "llvm/Target/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheARMTarget, llvm::TheThumbTarget; + +extern "C" void LLVMInitializeARMTargetInfo() { + RegisterTarget<Triple::arm, /*HasJIT=*/true> + X(TheARMTarget, "arm", "ARM"); + + RegisterTarget<Triple::thumb, /*HasJIT=*/true> + Y(TheThumbTarget, "thumb", "Thumb"); +} diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..3910bb0 --- /dev/null +++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMARMInfo + ARMTargetInfo.cpp + ) + +add_dependencies(LLVMARMInfo ARMCodeGenTable_gen) diff --git a/lib/Target/ARM/TargetInfo/Makefile b/lib/Target/ARM/TargetInfo/Makefile new file mode 100644 index 0000000..6292ab1 --- /dev/null +++ b/lib/Target/ARM/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMARMInfo + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index e13a811..7eed30e 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -22,63 +22,29 @@ using namespace llvm; -Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(*this, STI) { +Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) : RI(*this, STI) { } -bool Thumb1InstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned& SrcSubIdx, unsigned& DstSubIdx) const { - SrcSubIdx = DstSubIdx = 0; // No sub-registers. 
- - unsigned oc = MI.getOpcode(); - switch (oc) { - default: - return false; - case ARM::tMOVr: - case ARM::tMOVhir2lor: - case ARM::tMOVlor2hir: - case ARM::tMOVhir2hir: - assert(MI.getDesc().getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "Invalid Thumb MOV instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - return true; - } -} - -unsigned Thumb1InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case ARM::tRestore: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } +unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -unsigned Thumb1InstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case ARM::tSpill: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } +bool +Thumb1InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const { + if (MBB.empty()) return false; + + switch (MBB.back().getOpcode()) { + case ARM::tBX_RET: + case ARM::tBX_RET_vararg: + case ARM::tPOP_RET: + case ARM::tB: + case ARM::tBR_JTr: + return true; + default: break; } - return 0; + + return false; } bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB, @@ -91,15 +57,15 @@ bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB, if (DestRC == ARM::GPRRegisterClass) { if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg); + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); return true; } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg); + BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); return true; } } else if (DestRC == ARM::tGPRRegisterClass) { if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg); + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); return true; } else if (SrcRC == ARM::tGPRRegisterClass) { BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); @@ -120,17 +86,19 @@ canFoldMemoryOperand(const MachineInstr *MI, switch (Opc) { default: break; case ARM::tMOVr: - case ARM::tMOVlor2hir: - case ARM::tMOVhir2lor: - case ARM::tMOVhir2hir: { + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVgpr2gpr: { if (OpNum == 0) { // move -> store unsigned SrcReg = MI->getOperand(1).getReg(); - if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg)) + if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + !isARMLowRegister(SrcReg)) // tSpill cannot take a high register operand. return false; } else { // move -> load unsigned DstReg = MI->getOperand(0).getReg(); - if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg)) + if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + !isARMLowRegister(DstReg)) // tRestore cannot target a high register operand. 
return false; } @@ -148,36 +116,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!"); + assert((RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) && "Unknown regclass!"); if (RC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0)); } } -void Thumb1InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, - bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const{ - DebugLoc DL = DebugLoc::getUnknownLoc(); - unsigned Opc = 0; - - assert(RC == ARM::GPRRegisterClass && "Unknown regclass!"); - if (RC == ARM::GPRRegisterClass) { - Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR; - } - - MachineInstrBuilder MIB = - BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); - NewMIs.push_back(MIB); - return; -} - void Thumb1InstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, @@ -185,33 +134,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!"); + assert((RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) && "Unknown regclass!"); if (RC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg) - .addFrameIndex(FI).addImm(0); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg) + .addFrameIndex(FI).addImm(0)); } } -void Thumb1InstrInfo:: -loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - DebugLoc DL = DebugLoc::getUnknownLoc(); - unsigned Opc = 0; - - if (RC == ARM::GPRRegisterClass) { - Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR; - } - - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); - NewMIs.push_back(MIB); - return; -} - bool Thumb1InstrInfo:: spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -223,6 +155,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, if (MI != MBB.end()) DL = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH)); + AddDefaultPred(MIB); + MIB.addReg(0); // No write back. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); // Add the callee-saved register as live-in. It's killed at the spill. @@ -242,7 +176,12 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return false; bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; - MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc()); + DebugLoc DL = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP)); + AddDefaultPred(MIB); + MIB.addReg(0); // No write back. 
+ + bool NumRegs = 0; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (Reg == ARM::LR) { @@ -250,15 +189,16 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (isVarArg) continue; Reg = ARM::PC; - PopMI->setDesc(get(ARM::tPOP_RET)); + (*MIB).setDesc(get(ARM::tPOP_RET)); MI = MBB.erase(MI); } - PopMI->addOperand(MachineOperand::CreateReg(Reg, true)); + MIB.addReg(Reg, getDefRegState(true)); + ++NumRegs; } // It's illegal to emit pop instruction without operands. - if (PopMI->getNumOperands() > 0) - MBB.insert(MI, PopMI); + if (NumRegs) + MBB.insert(MI, &*MIB); return true; } @@ -274,27 +214,30 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, switch (Opc) { default: break; case ARM::tMOVr: - case ARM::tMOVlor2hir: - case ARM::tMOVhir2lor: - case ARM::tMOVhir2hir: { + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVgpr2gpr: { if (OpNum == 0) { // move -> store unsigned SrcReg = MI->getOperand(1).getReg(); bool isKill = MI->getOperand(1).isKill(); - if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg)) + if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + !isARMLowRegister(SrcReg)) // tSpill cannot take a high register operand. break; - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0); + NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0)); } else { // move -> load unsigned DstReg = MI->getOperand(0).getReg(); - if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg)) + if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + !isARMLowRegister(DstReg)) // tRestore cannot target a high register operand. break; bool isDead = MI->getOperand(0).isDead(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) - .addReg(DstReg, RegState::Define | getDeadRegState(isDead)) - .addFrameIndex(FI).addImm(0); + NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) + .addReg(DstReg, + RegState::Define | getDeadRegState(isDead)) + .addFrameIndex(FI).addImm(0)); } break; } diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 1bfa1d0..13cc578 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -27,6 +27,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb1InstrInfo(const ARMSubtarget &STI); + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + unsigned getUnindexedOpcode(unsigned Opc) const; + + // Return true if the block does not fall through. + bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const; + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
@@ -40,14 +47,6 @@ public: MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI) const; - bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const; - unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const; - bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SrcReg, @@ -58,21 +57,11 @@ public: unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC) const; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC) const; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - bool canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops) const; @@ -80,7 +69,7 @@ public: MachineInstr* MI, const SmallVectorImpl<unsigned> &Ops, int FrameIndex) const; - + MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, const SmallVectorImpl<unsigned> &Ops, diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 92f01d1..3c896da 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -13,12 +13,15 @@ #include "ARM.h" #include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "Thumb1InstrInfo.h" #include "Thumb1RegisterInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -30,14 +33,11 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; -static cl::opt<bool> -ThumbRegScavenging("enable-thumb-reg-scavenging", - cl::Hidden, - cl::desc("Enable register scavenging on Thumb")); - -Thumb1RegisterInfo::Thumb1RegisterInfo(const TargetInstrInfo &tii, +Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &sti) : ARMBaseRegisterInfo(tii, sti) { } @@ -46,20 +46,24 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const TargetInstrInfo &tii, /// specified immediate. 
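/// For example, the emitThumbRegPlusImmInReg path below materializes a byte
/// count with
///   MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes);
/// i.e. unpredicated (Pred defaults to ARMCC::AL) and with no sub-register
/// index, which becomes a tLDRcp from a fresh constant-pool entry.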
void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned DestReg, int Val, - const TargetInstrInfo *TII, - DebugLoc dl) const { + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, + unsigned PredReg) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); - Constant *C = ConstantInt::get(Type::Int32Ty, Val); + Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); - BuildMI(MBB, MBBI, dl, TII->get(ARM::tLDRcp), DestReg) - .addConstantPoolIndex(Idx); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); } const TargetRegisterClass* -Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const { +Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const { if (isARMLowRegister(Reg)) return ARM::tGPRRegisterClass; switch (Reg) { @@ -75,9 +79,16 @@ Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const { bool Thumb1RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return ThumbRegScavenging; + return true; +} + +bool +Thumb1RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) + const { + return true; } + bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); @@ -91,6 +102,7 @@ bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } + /// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize /// a destreg = basereg + immediate in Thumb code. Materialize the immediate /// in a register using mov / mvn sequences or load the immediate from a @@ -103,6 +115,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, const TargetInstrInfo &TII, const Thumb1RegisterInfo& MRI, DebugLoc dl) { + MachineFunction &MF = *MBB.getParent(); bool isHigh = !isARMLowRegister(DestReg) || (BaseReg != 0 && !isARMLowRegister(BaseReg)); bool isSub = false; @@ -117,31 +130,31 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, unsigned LdReg = DestReg; if (DestReg == ARM::SP) { assert(BaseReg == ARM::SP && "Unexpected!"); - LdReg = ARM::R3; - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12) - .addReg(ARM::R3, RegState::Kill); + LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); } if (NumBytes <= 255 && NumBytes >= 0) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes); + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) + .addImm(NumBytes); else if (NumBytes < 0 && NumBytes >= -255) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes); - BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg) + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) + .addImm(NumBytes); + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)) .addReg(LdReg, RegState::Kill); } else - MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, &TII, dl); + MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes); // Emit add / sub. int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? 
ARM::tADDhirr : ARM::tADDrr); - const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, - TII.get(Opc), DestReg); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (Opc != ARM::tADDhirr) + MIB = AddDefaultT1CC(MIB); if (DestReg == ARM::SP || isSub) MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill); else MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill); - if (DestReg == ARM::SP) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3) - .addReg(ARM::R12, RegState::Kill); + AddDefaultPred(MIB); } /// calcNumMI - Returns the number of instructions required to materialize @@ -187,6 +200,8 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, unsigned Scale = 1; int Opc = 0; int ExtraOpc = 0; + bool NeedCC = false; + bool NeedPred = false; if (DestReg == BaseReg && BaseReg == ARM::SP) { assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); @@ -213,7 +228,16 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, if (DestReg != BaseReg) DstNotEqBase = true; NumBits = 8; - Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + if (DestReg == ARM::SP) { + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); + NumBits = 7; + Scale = 4; + } else { + Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + NumBits = 8; + NeedPred = NeedCC = true; + } isTwoAddr = true; } @@ -233,8 +257,10 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, unsigned Chunk = (1 << 3) - 1; unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; Bytes -= ThisVal; - BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg) - .addReg(BaseReg, RegState::Kill).addImm(ThisVal); + const TargetInstrDesc &TID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3); + const MachineInstrBuilder MIB = + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)); + AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal)); } else { BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) .addReg(BaseReg, RegState::Kill); @@ -248,13 +274,22 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, Bytes -= ThisVal; ThisVal /= Scale; // Build the new tADD / tSUB. - if (isTwoAddr) - BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(DestReg).addImm(ThisVal); + if (isTwoAddr) { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (NeedCC) + MIB = AddDefaultT1CC(MIB); + MIB .addReg(DestReg).addImm(ThisVal); + if (NeedPred) + MIB = AddDefaultPred(MIB); + } else { bool isKill = BaseReg != ARM::SP; - BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (NeedCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); + if (NeedPred) + MIB = AddDefaultPred(MIB); BaseReg = DestReg; if (Opc == ARM::tADDrSPi) { @@ -265,15 +300,17 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, Scale = 1; Chunk = ((1 << NumBits) - 1) * Scale; Opc = isSub ? 
ARM::tSUBi8 : ARM::tADDi8; - isTwoAddr = true; + NeedPred = NeedCC = isTwoAddr = true; } } } - if (ExtraOpc) - BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg) - .addReg(DestReg, RegState::Kill) - .addImm(((unsigned)NumBytes) & 3); + if (ExtraOpc) { + const TargetInstrDesc &TID = TII.get(ExtraOpc); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) + .addReg(DestReg, RegState::Kill) + .addImm(((unsigned)NumBytes) & 3)); + } } static void emitSPUpdate(MachineBasicBlock &MBB, @@ -329,16 +366,64 @@ static void emitThumbConstant(MachineBasicBlock &MBB, int Chunk = (1 << 8) - 1; int ThisVal = (Imm > Chunk) ? Chunk : Imm; Imm -= ThisVal; - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), + DestReg)) + .addImm(ThisVal)); if (Imm > 0) emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl); - if (isSub) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg) - .addReg(DestReg, RegState::Kill); + if (isSub) { + const TargetInstrDesc &TID = TII.get(ARM::tRSB); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) + .addReg(DestReg, RegState::Kill)); + } } -void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const{ +static void removeOperands(MachineInstr &MI, unsigned i) { + unsigned Op = i; + for (unsigned e = MI.getNumOperands(); i != e; ++i) + MI.RemoveOperand(Op); +} + +int Thumb1RegisterInfo:: +rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int Offset, + unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const +{ + // if/when eliminateFrameIndex() conforms with ARMBaseRegisterInfo + // version then can pull out Thumb1 specific parts here + return 0; +} + +/// saveScavengerRegister - Save the register so it can be used by the +/// register scavenger. Return true. +bool Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *RC, + unsigned Reg) const { + // Thumb1 can't use the emergency spill slot on the stack because + // ldr/str immediate offsets must be positive, and if we're referencing + // off the frame pointer (if, for example, there are alloca() calls in + // the function, the offset will be negative. Use R12 instead since that's + // a call clobbered register that we know won't be used in Thumb1 mode. + + TII.copyRegToReg(MBB, I, ARM::R12, Reg, ARM::GPRRegisterClass, RC); + return true; +} + +/// restoreScavengerRegister - restore a registers saved by +// saveScavengerRegister(). +void Thumb1RegisterInfo::restoreScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *RC, + unsigned Reg) const { + TII.copyRegToReg(MBB, I, Reg, ARM::R12, RC, ARM::GPRRegisterClass); +} + +unsigned +Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, int *Value, + RegScavenger *RS) const{ + unsigned VReg = 0; unsigned i = 0; MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); @@ -380,7 +465,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Scale = 1; if (FrameReg != ARM::SP) { Opcode = ARM::tADDi3; - MI.setDesc(TII.get(ARM::tADDi3)); + MI.setDesc(TII.get(Opcode)); NumBits = 3; } else { NumBits = 8; @@ -391,19 +476,26 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (Offset == 0) { // Turn it into a move. 
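// (In other words, "add rD, sp/fp, #0" does no arithmetic, so the rewrite
// below demotes the instruction to a plain tMOVgpr2tgpr from the frame
// register and drops the now-meaningless immediate operand.)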
- MI.setDesc(TII.get(ARM::tMOVhir2lor)); + MI.setDesc(TII.get(ARM::tMOVgpr2tgpr)); MI.getOperand(i).ChangeToRegister(FrameReg, false); MI.RemoveOperand(i+1); - return; + return 0; } // Common case: small offset, fits into instruction. unsigned Mask = (1 << NumBits) - 1; if (((Offset / Scale) & ~Mask) == 0) { // Replace the FrameIndex with sp / fp - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Offset / Scale); - return; + if (Opcode == ARM::tADDi3) { + removeOperands(MI, i); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg) + .addImm(Offset / Scale)); + } else { + MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(i+1).ChangeToImmediate(Offset / Scale); + } + return 0; } unsigned DestReg = MI.getOperand(0).getReg(); @@ -415,15 +507,21 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII, *this, dl); MBB.erase(II); - return; + return 0; } if (Offset > 0) { // Translate r0 = add sp, imm to // r0 = add sp, 255*4 // r0 = add r0, (imm - 255*4) - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Mask); + if (Opcode == ARM::tADDi3) { + removeOperands(MI, i); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask)); + } else { + MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(i+1).ChangeToImmediate(Mask); + } Offset = (Offset - Mask * Scale); MachineBasicBlock::iterator NII = next(II); emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII, @@ -433,11 +531,16 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // r0 = -imm (this is then translated into a series of instructons) // r0 = add r0, sp emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl); + MI.setDesc(TII.get(ARM::tADDhirr)); MI.getOperand(i).ChangeToRegister(DestReg, false, false, true); MI.getOperand(i+1).ChangeToRegister(FrameReg, false); + if (Opcode == ARM::tADDi3) { + MachineInstrBuilder MIB(&MI); + AddDefaultPred(MIB); + } } - return; + return 0; } else { unsigned ImmIdx = 0; int InstrOffs = 0; @@ -452,8 +555,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, break; } default: - assert(0 && "Unsupported addressing mode!"); - abort(); + llvm_unreachable("Unsupported addressing mode!"); break; } @@ -468,7 +570,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Replace the FrameIndex with sp MI.getOperand(i).ChangeToRegister(FrameReg, false); ImmOp.ChangeToImmediate(ImmedOffset); - return; + return 0; } bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill; @@ -495,6 +597,11 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // SP+LargeImm. assert(Offset && "This code isn't needed if offset already handled!"); + // Remove predicate first. + int PIdx = MI.findFirstPredOperandIdx(); + if (PIdx != -1) + removeOperands(MI, PIdx); + if (Desc.mayLoad()) { // Use the destination register to materialize sp + offset. 
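// (Sketch of the load path handled here: the destination register is about
// to be overwritten anyway, so it doubles as the scratch register; the code
// materializes FrameReg + Offset into it first, then rewrites the
// instruction into a tLDR through that register, avoiding any scavenging.)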
unsigned TmpReg = MI.getOperand(0).getReg(); @@ -504,12 +611,14 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg, Offset, false, TII, *this, dl); else { - emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl); + emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset); UseRR = true; } - } else + } else { emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII, *this, dl); + } + MI.setDesc(TII.get(ARM::tLDR)); MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); if (UseRR) @@ -518,52 +627,37 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else // tLDR has an extra register operand. MI.addOperand(MachineOperand::CreateReg(0, false)); } else if (Desc.mayStore()) { - // FIXME! This is horrific!!! We need register scavenging. - // Our temporary workaround has marked r3 unavailable. Of course, r3 is - // also a ABI register so it's possible that is is the register that is - // being storing here. If that's the case, we do the following: - // r12 = r2 - // Use r2 to materialize sp + offset - // str r3, r2 - // r2 = r12 - unsigned ValReg = MI.getOperand(0).getReg(); - unsigned TmpReg = ARM::R3; - bool UseRR = false; - if (ValReg == ARM::R3) { - BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12) - .addReg(ARM::R2, RegState::Kill); - TmpReg = ARM::R2; - } - if (TmpReg == ARM::R3 && AFI->isR3LiveIn()) - BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12) - .addReg(ARM::R3, RegState::Kill); - if (Opcode == ARM::tSpill) { - if (FrameReg == ARM::SP) - emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg, - Offset, false, TII, *this, dl); - else { - emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl); - UseRR = true; - } - } else - emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII, - *this, dl); - MI.setDesc(TII.get(ARM::tSTR)); - MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); - if (UseRR) // Use [reg, reg] addrmode. - MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); - else // tSTR has an extra register operand. - MI.addOperand(MachineOperand::CreateReg(0, false)); - - MachineBasicBlock::iterator NII = next(II); - if (ValReg == ARM::R3) - BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2) - .addReg(ARM::R12, RegState::Kill); - if (TmpReg == ARM::R3 && AFI->isR3LiveIn()) - BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3) - .addReg(ARM::R12, RegState::Kill); + VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + assert (Value && "Frame index virtual allocated, but Value arg is NULL!"); + *Value = Offset; + bool UseRR = false; + + if (Opcode == ARM::tSpill) { + if (FrameReg == ARM::SP) + emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg, + Offset, false, TII, *this, dl); + else { + emitLoadConstPool(MBB, II, dl, VReg, 0, Offset); + UseRR = true; + } + } else + emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII, + *this, dl); + MI.setDesc(TII.get(ARM::tSTR)); + MI.getOperand(i).ChangeToRegister(VReg, false, false, true); + if (UseRR) // Use [reg, reg] addrmode. + MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); + else // tSTR has an extra register operand. + MI.addOperand(MachineOperand::CreateReg(0, false)); } else assert(false && "Unexpected opcode!"); + + // Add predicate back if it's needed. 
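// (AddDefaultPred appends the standard "always" predicate operand pair,
// ARMCC::AL plus a null condition-register operand, restoring what the
// removeOperands(MI, PIdx) call above stripped before the rewrite.)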
+ if (MI.getDesc().isPredicable()) { + MachineInstrBuilder MIB(&MI); + AddDefaultPred(MIB); + } + return VReg; } void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { @@ -577,15 +671,6 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { DebugLoc dl = (MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc::getUnknownLoc()); - // Check if R3 is live in. It might have to be used as a scratch register. - for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(), - E = MF.getRegInfo().livein_end(); I != E; ++I) { - if (I->first == ARM::R3) { - AFI->setR3IsLiveIn(true); - break; - } - } - // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. NumBytes = (NumBytes + 3) & ~3; MFI->setStackSize(NumBytes); @@ -647,8 +732,7 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { // Darwin ABI requires FP to point to the stack slot that contains the // previous FP. if (STI.isTargetDarwin() || hasFP(MF)) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) + BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addFrameIndex(FramePtrSpillFI).addImm(0); } @@ -729,7 +813,7 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF, emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes, TII, *this, dl); else - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP) + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) .addReg(FramePtr); } else { if (MBBI->getOpcode() == ARM::tBX_RET && @@ -745,11 +829,14 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF, if (VARegSaveSize) { // Epilogue for vararg functions: pop LR to R3 and branch off it. // FIXME: Verify this is still ok when R3 is no longer being reserved. - BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) + .addReg(0) // No write back. + .addReg(ARM::R3, RegState::Define); emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize); - BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)) + .addReg(ARM::R3, RegState::Kill); MBB.erase(MBBI); } } diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 6d4f1f0..bb7a619 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -20,28 +20,28 @@ namespace llvm { class ARMSubtarget; - class TargetInstrInfo; + class ARMBaseInstrInfo; class Type; struct Thumb1RegisterInfo : public ARMBaseRegisterInfo { public: - Thumb1RegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI); + Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. - void emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned DestReg, int Val, - const TargetInstrInfo *TII, - DebugLoc dl) const; + void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) const; /// Code Generation virtual methods... 
const TargetRegisterClass * - getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const; - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const; bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const; bool hasReservedCallFrame(MachineFunction &MF) const; @@ -49,8 +49,23 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + // rewrite MI to access 'Offset' bytes from the FP. Return the offset that + // could not be handled directly in MI. + int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int Offset, + unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const; + + bool saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *RC, + unsigned Reg) const; + void restoreScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *RC, + unsigned Reg) const; + unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, int *Value = NULL, + RegScavenger *RS = NULL) const; void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp new file mode 100644 index 0000000..98b5cbd --- /dev/null +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -0,0 +1,158 @@ +//===-- Thumb2ITBlockPass.cpp - Insert Thumb IT blocks -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "thumb2-it" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/Compiler.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumITs, "Number of IT blocks inserted"); + +namespace { + struct VISIBILITY_HIDDEN Thumb2ITBlockPass : public MachineFunctionPass { + static char ID; + Thumb2ITBlockPass() : MachineFunctionPass(&ID) {} + + const Thumb2InstrInfo *TII; + ARMFunctionInfo *AFI; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "Thumb IT blocks insertion pass"; + } + + private: + MachineBasicBlock::iterator + SplitT2MOV32imm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineInstr *MI, DebugLoc dl, + unsigned PredReg, ARMCC::CondCodes CC); + bool InsertITBlocks(MachineBasicBlock &MBB); + }; + char Thumb2ITBlockPass::ID = 0; +} + +static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){ + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) + return ARMCC::AL; + return llvm::getInstrPredicate(MI, PredReg); +} + +MachineBasicBlock::iterator +Thumb2ITBlockPass::SplitT2MOV32imm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr *MI, + DebugLoc dl, unsigned PredReg, + ARMCC::CondCodes CC) { + // Splitting t2MOVi32imm into a pair of t2MOVi16 + t2MOVTi16 here. 
+ // The only reason it was a single instruction was so it could be + // re-materialized. We want to split it before this and the thumb2 + // size reduction pass to make sure the IT mask is correct and expose + // width reduction opportunities. It doesn't make sense to do this in a + // separate pass so here it is. + unsigned DstReg = MI->getOperand(0).getReg(); + bool DstDead = MI->getOperand(0).isDead(); // Is this possible? + unsigned Imm = MI->getOperand(1).getImm(); + unsigned Lo16 = Imm & 0xffff; + unsigned Hi16 = (Imm >> 16) & 0xffff; + BuildMI(MBB, MBBI, dl, TII->get(ARM::t2MOVi16), DstReg) + .addImm(Lo16).addImm(CC).addReg(PredReg); + BuildMI(MBB, MBBI, dl, TII->get(ARM::t2MOVTi16)) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)) + .addReg(DstReg).addImm(Hi16).addImm(CC).addReg(PredReg); + --MBBI; + --MBBI; + MI->eraseFromParent(); + return MBBI; +} + +bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineInstr *MI = &*MBBI; + DebugLoc dl = MI->getDebugLoc(); + unsigned PredReg = 0; + ARMCC::CondCodes CC = getPredicate(MI, PredReg); + + if (MI->getOpcode() == ARM::t2MOVi32imm) { + MBBI = SplitT2MOV32imm(MBB, MBBI, MI, dl, PredReg, CC); + continue; + } + + if (CC == ARMCC::AL) { + ++MBBI; + continue; + } + + // Insert an IT instruction. + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) + .addImm(CC); + ++MBBI; + + // Finalize IT mask. + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + unsigned Mask = 0, Pos = 3; + while (MBBI != E && Pos) { + MachineInstr *NMI = &*MBBI; + DebugLoc ndl = NMI->getDebugLoc(); + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg); + if (NMI->getOpcode() == ARM::t2MOVi32imm) { + MBBI = SplitT2MOV32imm(MBB, MBBI, NMI, ndl, NPredReg, NCC); + continue; + } + + if (NCC == OCC) { + Mask |= (1 << Pos); + } else if (NCC != CC) + break; + --Pos; + ++MBBI; + } + Mask |= (1 << Pos); + MIB.addImm(Mask); + Modified = true; + ++NumITs; + } + + return Modified; +} + +bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { + const TargetMachine &TM = Fn.getTarget(); + AFI = Fn.getInfo<ARMFunctionInfo>(); + TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + + if (!AFI->isThumbFunction()) + return false; + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= InsertITBlocks(MBB); + } + + return Modified; +} + +/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks +/// insertion pass. 
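///
/// For illustration (an assumed example, not from the patch): for a run such
/// as "addeq ...; subne ...; addeq ..." the pass plants an "itet eq" block
/// ahead of the group. InsertITBlocks above builds the mask from bit 3
/// downward, setting a bit for each trailing instruction predicated on the
/// opposite condition, plus one final set bit that marks where the block
/// ends (so a block covers at most four instructions).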
+FunctionPass *llvm::createThumb2ITBlockPass() { + return new Thumb2ITBlockPass(); +} diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 35d09fd..264601b 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -13,6 +13,7 @@ #include "ARMInstrInfo.h" #include "ARM.h" +#include "ARMAddressingModes.h" #include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -22,127 +23,62 @@ using namespace llvm; -Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(*this, STI) { +Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : RI(*this, STI) { } -bool Thumb2InstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned& SrcSubIdx, unsigned& DstSubIdx) const { - SrcSubIdx = DstSubIdx = 0; // No sub-registers. - - unsigned oc = MI.getOpcode(); - switch (oc) { - default: - return false; - // FIXME: Thumb2 - case ARM::tMOVr: - case ARM::tMOVhir2lor: - case ARM::tMOVlor2hir: - case ARM::tMOVhir2hir: - assert(MI.getDesc().getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "Invalid Thumb MOV instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - return true; - } -} - -unsigned Thumb2InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - // FIXME: Thumb2 - case ARM::tRestore: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } +unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { + // FIXME return 0; } -unsigned Thumb2InstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - // FIXME: Thumb2 - case ARM::tSpill: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } +bool +Thumb2InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const { + if (MBB.empty()) return false; + + switch (MBB.back().getOpcode()) { + case ARM::t2LDM_RET: + case ARM::t2B: // Uncond branch. + case ARM::t2BR_JT: // Jumptable branch. + case ARM::t2TBB: // Table branch byte. + case ARM::t2TBH: // Table branch halfword. + case ARM::tBR_JTr: // Jumptable branch (16-bit version). 
+ case ARM::tBX_RET: + case ARM::tBX_RET_vararg: + case ARM::tPOP_RET: + case ARM::tB: + return true; + default: break; } - return 0; + + return false; } -bool Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC) const { +bool +Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC) const { DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - // FIXME: Thumb2 - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; - } - } - - return false; -} - -bool Thumb2InstrInfo:: -canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVlor2hir: - case ARM::tMOVhir2lor: - case ARM::tMOVhir2hir: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - return false; - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. - return false; - } + if (DestRC == ARM::GPRRegisterClass && + SrcRC == ARM::GPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); + return true; + } else if (DestRC == ARM::GPRRegisterClass && + SrcRC == ARM::tGPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); + return true; + } else if (DestRC == ARM::tGPRRegisterClass && + SrcRC == ARM::GPRRegisterClass) { + BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); return true; - } } - return false; + // Handle SPR, DPR, and QPR copies. + return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC); } void Thumb2InstrInfo:: @@ -152,36 +88,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!"); - - // FIXME: Thumb2 - if (RC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0); - } -} - -void Thumb2InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, - bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const{ - DebugLoc DL = DebugLoc::getUnknownLoc(); - unsigned Opc = 0; - - // FIXME: Thumb2. Is GPRRegClass here correct? 
- assert(RC == ARM::GPRRegisterClass && "Unknown regclass!"); if (RC == ARM::GPRRegisterClass) { - Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR; + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0)); + return; } - MachineInstrBuilder MIB = - BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); - NewMIs.push_back(MIB); - return; + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC); } void Thumb2InstrInfo:: @@ -191,122 +105,381 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - // FIXME: Thumb2 - assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!"); - - if (RC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg) - .addFrameIndex(FI).addImm(0); + if (RC == ARM::GPRRegisterClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) + .addFrameIndex(FI).addImm(0)); + return; } + + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC); } -void Thumb2InstrInfo:: -loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - DebugLoc DL = DebugLoc::getUnknownLoc(); - unsigned Opc = 0; - // FIXME: Thumb2. Is GPRRegClass ok here? - if (RC == ARM::GPRRegisterClass) { - Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR; +void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + // If profitable, use a movw or movt to materialize the offset. + // FIXME: Use the scavenger to grab a scratch register. + if (DestReg != ARM::SP && DestReg != BaseReg && + NumBytes >= 4096 && + ARM_AM::getT2SOImmVal(NumBytes) == -1) { + bool Fits = false; + if (NumBytes < 65536) { + // Use a movw to materialize the 16-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) + .addImm(NumBytes) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + Fits = true; + } else if ((NumBytes & 0xffff) == 0) { + // Use a movt to materialize the 32-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) + .addReg(DestReg) + .addImm(NumBytes >> 16) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + Fits = true; + } + + if (Fits) { + if (isSub) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg) + .addReg(BaseReg, RegState::Kill) + .addReg(DestReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + } else { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg) + .addReg(DestReg, RegState::Kill) + .addReg(BaseReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + } + return; + } } - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); - NewMIs.push_back(MIB); - return; -} + while (NumBytes) { + unsigned ThisVal = NumBytes; + unsigned Opc = 0; + if (DestReg == ARM::SP && BaseReg != ARM::SP) { + // mov sp, rn. Note t2MOVr cannot be used. 
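+ // (Editorial aside, not in the original commit: the movw/movt fast path
+ // above fires for offsets such as NumBytes = 4100, whose set bits span
+ // more than 8 adjacent positions, so ARM_AM::getT2SOImmVal(4100) == -1;
+ // since 4100 < 65536 a single t2MOVi16 "movw" materializes it and one
+ // t2ADDrr finishes the sum. The copy just below instead handles the
+ // DestReg == SP case by moving BaseReg into SP first.)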
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg); + BaseReg = ARM::SP; + continue; + } -bool Thumb2InstrInfo:: -spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI) const { - if (CSI.empty()) - return false; + if (BaseReg == ARM::SP) { + // sub sp, sp, #imm7 + if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) { + assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?"); + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + // FIXME: Fix Thumb1 immediate encoding. + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg).addImm(ThisVal/4); + NumBytes = 0; + continue; + } + + // sub rd, sp, so_imm + Opc = isSub ? ARM::t2SUBrSPi : ARM::t2ADDrSPi; + if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { + NumBytes = 0; + } else { + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = CountLeadingZeros_32(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); + } + } else { + assert(DestReg != ARM::SP && BaseReg != ARM::SP); + Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri; + if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { + NumBytes = 0; + } else if (ThisVal < 4096) { + Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; + NumBytes = 0; + } else { + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = CountLeadingZeros_32(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); + } + } - DebugLoc DL = DebugLoc::getUnknownLoc(); - if (MI != MBB.end()) DL = MI->getDebugLoc(); - - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH)); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - // Add the callee-saved register as live-in. It's killed at the spill. - MBB.addLiveIn(Reg); - MIB.addReg(Reg, RegState::Kill); + // Build the new ADD / SUB. + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill) + .addImm(ThisVal))); + + BaseReg = DestReg; } - return true; } -bool Thumb2InstrInfo:: -restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI) const { - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (CSI.empty()) - return false; - - bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; - MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc()); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - if (Reg == ARM::LR) { - // Special epilogue for vararg functions. 
See emitEpilogue - if (isVarArg) - continue; - Reg = ARM::PC; - PopMI->setDesc(get(ARM::tPOP_RET)); - MI = MBB.erase(MI); - } - PopMI->addOperand(MachineOperand::CreateReg(Reg, true)); +static unsigned +negativeOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi12: return ARM::t2LDRi8; + case ARM::t2LDRHi12: return ARM::t2LDRHi8; + case ARM::t2LDRBi12: return ARM::t2LDRBi8; + case ARM::t2LDRSHi12: return ARM::t2LDRSHi8; + case ARM::t2LDRSBi12: return ARM::t2LDRSBi8; + case ARM::t2STRi12: return ARM::t2STRi8; + case ARM::t2STRBi12: return ARM::t2STRBi8; + case ARM::t2STRHi12: return ARM::t2STRHi8; + + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + return opcode; + + default: + break; } - // It's illegal to emit pop instruction without operands. - if (PopMI->getNumOperands() > 0) - MBB.insert(MI, PopMI); + return 0; +} + +static unsigned +positiveOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi8: return ARM::t2LDRi12; + case ARM::t2LDRHi8: return ARM::t2LDRHi12; + case ARM::t2LDRBi8: return ARM::t2LDRBi12; + case ARM::t2LDRSHi8: return ARM::t2LDRSHi12; + case ARM::t2LDRSBi8: return ARM::t2LDRSBi12; + case ARM::t2STRi8: return ARM::t2STRi12; + case ARM::t2STRBi8: return ARM::t2STRBi12; + case ARM::t2STRHi8: return ARM::t2STRHi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + return opcode; - return true; + default: + break; + } + + return 0; } -MachineInstr *Thumb2InstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVlor2hir: - case ARM::tMOVhir2lor: - case ARM::tMOVhir2hir: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - break; - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
- break; - bool isDead = MI->getOperand(0).isDead(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) - .addReg(DstReg, RegState::Define | getDeadRegState(isDead)) - .addFrameIndex(FI).addImm(0); - } +static unsigned +immediateOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRs: return ARM::t2LDRi12; + case ARM::t2LDRHs: return ARM::t2LDRHi12; + case ARM::t2LDRBs: return ARM::t2LDRBi12; + case ARM::t2LDRSHs: return ARM::t2LDRSHi12; + case ARM::t2LDRSBs: return ARM::t2LDRSBi12; + case ARM::t2STRs: return ARM::t2STRi12; + case ARM::t2STRBs: return ARM::t2STRBi12; + case ARM::t2STRHs: return ARM::t2STRHi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + return opcode; + + default: break; } + + return 0; +} + +bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrModeT2_i12. + if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? + + if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + + bool isSP = FrameReg == ARM::SP; + if (Offset == 0) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::tMOVgpr2gpr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.RemoveOperand(FrameRegIdx+1); + Offset = 0; + return true; + } + + if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(isSP ? ARM::t2SUBrSPi : ARM::t2SUBri)); + } else { + MI.setDesc(TII.get(isSP ? ARM::t2ADDrSPi : ARM::t2ADDri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getT2SOImmVal(Offset) != -1) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + Offset = 0; + return true; + } + // Another common case: imm12. + if (Offset < 4096) { + unsigned NewOpc = isSP + ? (isSub ? ARM::t2SUBrSPi12 : ARM::t2ADDrSPi12) + : (isSub ? ARM::t2SUBri12 : ARM::t2ADDri12); + MI.setDesc(TII.get(NewOpc)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + Offset = 0; + return true; + } + + // Otherwise, extract 8 adjacent bits from the immediate into this + // t2ADDri/t2SUBri. + unsigned RotAmt = CountLeadingZeros_32(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + } else { + + // AddrMode4 cannot handle any offset. + if (AddrMode == ARMII::AddrMode4) + return false; + + // AddrModeT2_so cannot handle any offset. If there is no offset + // register then we change to an immediate version. 
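+ // (Editorial worked example, not in the original commit: in the t2ADDri
+ // branch above, Offset = 0x12345 is neither a Thumb-2 modified immediate
+ // nor < 4096, so the 8-adjacent-bit extraction applies:
+ // CountLeadingZeros_32(0x12345) = 15, ARM_AM::rotr32(0xff000000, 15)
+ // = 0x1fe00 keeps bits 16..9, ThisImmVal becomes 0x12200 (0x91 << 9, a
+ // valid t2_so_imm), and the residual 0x145 stays in Offset for the
+ // caller to fold or materialize.)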
+ unsigned NewOpc = Opcode; + if (AddrMode == ARMII::AddrModeT2_so) { + unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + if (OffsetReg != 0) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + return Offset == 0; + } + + MI.RemoveOperand(FrameRegIdx+1); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0); + NewOpc = immediateOffsetOpcode(Opcode); + AddrMode = ARMII::AddrModeT2_i12; + } + + unsigned NumBits = 0; + unsigned Scale = 1; + if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) { + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign convert Opcode to the appropriate + // instruction + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset < 0) { + NewOpc = negativeOffsetOpcode(Opcode); + NumBits = 8; + isSub = true; + Offset = -Offset; + } else { + NewOpc = positiveOffsetOpcode(Opcode); + NumBits = 12; + } + } else { + // VFP and NEON address modes. + int InstrOffs = 0; + if (AddrMode == ARMII::AddrMode5) { + const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1); + InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm()); + if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub) + InstrOffs *= -1; + } + NumBits = 8; + Scale = 4; + Offset += InstrOffs * 4; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + if (Offset < 0) { + Offset = -Offset; + isSub = true; + } + } + + if (NewOpc != Opcode) + MI.setDesc(TII.get(NewOpc)); + + MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1); + + // Attempt to fold address computation + // Common case: small offset, fits into instruction. + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with fp/sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (isSub) { + if (AddrMode == ARMII::AddrMode5) + // FIXME: Not consistent. + ImmedOffset |= 1 << NumBits; + else + ImmedOffset = -ImmedOffset; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, offset doesn't fit. Pull in what we can to simplify + ImmedOffset = ImmedOffset & Mask; + if (isSub) { + if (AddrMode == ARMII::AddrMode5) + // FIXME: Not consistent. + ImmedOffset |= 1 << NumBits; + else { + ImmedOffset = -ImmedOffset; + if (ImmedOffset == 0) + // Change the opcode back if the encoded offset is zero. + MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc))); + } + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); } - return NewMI; + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 84dcb49..f3688c0 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -27,66 +27,34 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb2InstrInfo(const ARMSubtarget &STI); - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). 
- ///
- const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
-
- bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const;
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
- bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
 bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const;
- void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC) const;
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
 void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC) const;
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- bool canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const;
-
- MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
-
- MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const {
- return 0;
- }
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ /// + const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } }; } diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp index 0f0c0e4..6c4c15d 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -13,12 +13,15 @@ #include "ARM.h" #include "ARMAddressingModes.h" +#include "ARMBaseInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "Thumb2RegisterInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -30,14 +33,10 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -static cl::opt<bool> -Thumb2RegScavenging("enable-thumb2-reg-scavenging", - cl::Hidden, - cl::desc("Enable register scavenging on Thumb-2")); - -Thumb2RegisterInfo::Thumb2RegisterInfo(const TargetInstrInfo &tii, +Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &sti) : ARMBaseRegisterInfo(tii, sti) { } @@ -46,710 +45,23 @@ Thumb2RegisterInfo::Thumb2RegisterInfo(const TargetInstrInfo &tii, /// specified immediate. void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned DestReg, int Val, - const TargetInstrInfo *TII, - DebugLoc dl) const { + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, + unsigned PredReg) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); - Constant *C = ConstantInt::get(Type::Int32Ty, Val); + Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); - BuildMI(MBB, MBBI, dl, TII->get(ARM::tLDRcp), DestReg) - .addConstantPoolIndex(Idx); -} - -const TargetRegisterClass* -Thumb2RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const { - if (isARMLowRegister(Reg)) - return ARM::tGPRRegisterClass; - switch (Reg) { - default: - break; - case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC: - return ARM::GPRRegisterClass; - } - - return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT); -} - -bool -Thumb2RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return Thumb2RegScavenging; -} - -bool Thumb2RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - unsigned CFSize = FFI->getMaxCallFrameSize(); - // It's not always a good idea to include the call frame as part of the - // stack frame. ARM (especially Thumb) has small immediate offset to - // address the stack frame. So a large call frame can cause poor codegen - // and may even makes it impossible to scavenge a register. - if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 - return false; - - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize -/// a destreg = basereg + immediate in Thumb code. Materialize the immediate -/// in a register using mov / mvn sequences or load the immediate from a -/// constpool entry. 
-static -void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned DestReg, unsigned BaseReg, - int NumBytes, bool CanChangeCC, - const TargetInstrInfo &TII, - const Thumb2RegisterInfo& MRI, - DebugLoc dl) { - bool isHigh = !isARMLowRegister(DestReg) || - (BaseReg != 0 && !isARMLowRegister(BaseReg)); - bool isSub = false; - // Subtract doesn't have high register version. Load the negative value - // if either base or dest register is a high register. Also, if do not - // issue sub as part of the sequence if condition register is to be - // preserved. - if (NumBytes < 0 && !isHigh && CanChangeCC) { - isSub = true; - NumBytes = -NumBytes; - } - unsigned LdReg = DestReg; - if (DestReg == ARM::SP) { - assert(BaseReg == ARM::SP && "Unexpected!"); - LdReg = ARM::R3; - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12) - .addReg(ARM::R3, RegState::Kill); - } - - if (NumBytes <= 255 && NumBytes >= 0) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes); - else if (NumBytes < 0 && NumBytes >= -255) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes); - BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg) - .addReg(LdReg, RegState::Kill); - } else - MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, &TII, dl); - - // Emit add / sub. - int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr); - const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, - TII.get(Opc), DestReg); - if (DestReg == ARM::SP || isSub) - MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill); - else - MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill); - if (DestReg == ARM::SP) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3) - .addReg(ARM::R12, RegState::Kill); -} - -/// calcNumMI - Returns the number of instructions required to materialize -/// the specific add / sub r, c instruction. -static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, - unsigned NumBits, unsigned Scale) { - unsigned NumMIs = 0; - unsigned Chunk = ((1 << NumBits) - 1) * Scale; - - if (Opc == ARM::tADDrSPi) { - unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; - Bytes -= ThisVal; - NumMIs++; - NumBits = 8; - Scale = 1; // Followed by a number of tADDi8. - Chunk = ((1 << NumBits) - 1) * Scale; - } - - NumMIs += Bytes / Chunk; - if ((Bytes % Chunk) != 0) - NumMIs++; - if (ExtraOpc) - NumMIs++; - return NumMIs; -} - -/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize -/// a destreg = basereg + immediate in Thumb code. -static -void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned DestReg, unsigned BaseReg, - int NumBytes, const TargetInstrInfo &TII, - const Thumb2RegisterInfo& MRI, - DebugLoc dl) { - bool isSub = NumBytes < 0; - unsigned Bytes = (unsigned)NumBytes; - if (isSub) Bytes = -NumBytes; - bool isMul4 = (Bytes & 3) == 0; - bool isTwoAddr = false; - bool DstNotEqBase = false; - unsigned NumBits = 1; - unsigned Scale = 1; - int Opc = 0; - int ExtraOpc = 0; - - if (DestReg == BaseReg && BaseReg == ARM::SP) { - assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); - NumBits = 7; - Scale = 4; - Opc = isSub ? 
ARM::tSUBspi : ARM::tADDspi; - isTwoAddr = true; - } else if (!isSub && BaseReg == ARM::SP) { - // r1 = add sp, 403 - // => - // r1 = add sp, 100 * 4 - // r1 = add r1, 3 - if (!isMul4) { - Bytes &= ~3; - ExtraOpc = ARM::tADDi3; - } - NumBits = 8; - Scale = 4; - Opc = ARM::tADDrSPi; - } else { - // sp = sub sp, c - // r1 = sub sp, c - // r8 = sub sp, c - if (DestReg != BaseReg) - DstNotEqBase = true; - NumBits = 8; - Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; - isTwoAddr = true; - } - - unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale); - unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2; - if (NumMIs > Threshold) { - // This will expand into too many instructions. Load the immediate from a - // constpool entry. - emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII, - MRI, dl); - return; - } - - if (DstNotEqBase) { - if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) { - // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7) - unsigned Chunk = (1 << 3) - 1; - unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; - Bytes -= ThisVal; - BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg) - .addReg(BaseReg, RegState::Kill).addImm(ThisVal); - } else { - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) - .addReg(BaseReg, RegState::Kill); - } - BaseReg = DestReg; - } - - unsigned Chunk = ((1 << NumBits) - 1) * Scale; - while (Bytes) { - unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; - Bytes -= ThisVal; - ThisVal /= Scale; - // Build the new tADD / tSUB. - if (isTwoAddr) - BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(DestReg).addImm(ThisVal); - else { - bool isKill = BaseReg != ARM::SP; - BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); - BaseReg = DestReg; - - if (Opc == ARM::tADDrSPi) { - // r4 = add sp, imm - // r4 = add r4, imm - // ... - NumBits = 8; - Scale = 1; - Chunk = ((1 << NumBits) - 1) * Scale; - Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; - isTwoAddr = true; - } - } - } - - if (ExtraOpc) - BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg) - .addReg(DestReg, RegState::Kill) - .addImm(((unsigned)NumBytes) & 3); -} - -static void emitSPUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo &TII, DebugLoc dl, - const Thumb2RegisterInfo &MRI, - int NumBytes) { - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, - MRI, dl); -} - -void Thumb2RegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - if (!hasReservedCallFrame(MF)) { - // If we have alloca, convert as follows: - // ADJCALLSTACKDOWN -> sub, sp, sp, amount - // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr *Old = I; - DebugLoc dl = Old->getDebugLoc(); - unsigned Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; - - // Replace the pseudo instruction with a new instruction... 
- unsigned Opc = Old->getOpcode(); - if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { - emitSPUpdate(MBB, I, TII, dl, *this, -Amount); - } else { - assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); - emitSPUpdate(MBB, I, TII, dl, *this, Amount); - } - } - } - MBB.erase(I); -} - -/// emitThumbConstant - Emit a series of instructions to materialize a -/// constant. -static void emitThumbConstant(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned DestReg, int Imm, - const TargetInstrInfo &TII, - const Thumb2RegisterInfo& MRI, - DebugLoc dl) { - bool isSub = Imm < 0; - if (isSub) Imm = -Imm; - - int Chunk = (1 << 8) - 1; - int ThisVal = (Imm > Chunk) ? Chunk : Imm; - Imm -= ThisVal; - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal); - if (Imm > 0) - emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl); - if (isSub) - BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg) - .addReg(DestReg, RegState::Kill); -} - -void Thumb2RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const{ - unsigned i = 0; - MachineInstr &MI = *II; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc dl = MI.getDebugLoc(); - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - unsigned FrameReg = ARM::SP; - int FrameIndex = MI.getOperand(i).getIndex(); - int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MF.getFrameInfo()->getStackSize() + SPAdj; - - if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex)) - Offset -= AFI->getGPRCalleeSavedArea1Offset(); - else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) - Offset -= AFI->getGPRCalleeSavedArea2Offset(); - else if (hasFP(MF)) { - assert(SPAdj == 0 && "Unexpected"); - // There is alloca()'s in this function, must reference off the frame - // pointer instead. - FrameReg = getFrameRegister(MF); - Offset -= AFI->getFramePtrSpillOffset(); - } - - unsigned Opcode = MI.getOpcode(); - const TargetInstrDesc &Desc = MI.getDesc(); - unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); - - if (Opcode == ARM::tADDrSPi) { - Offset += MI.getOperand(i+1).getImm(); - - // Can't use tADDrSPi if it's based off the frame pointer. - unsigned NumBits = 0; - unsigned Scale = 1; - if (FrameReg != ARM::SP) { - Opcode = ARM::tADDi3; - MI.setDesc(TII.get(ARM::tADDi3)); - NumBits = 3; - } else { - NumBits = 8; - Scale = 4; - assert((Offset & 3) == 0 && - "Thumb add/sub sp, #imm immediate must be multiple of 4!"); - } - - if (Offset == 0) { - // Turn it into a move. - MI.setDesc(TII.get(ARM::tMOVhir2lor)); - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.RemoveOperand(i+1); - return; - } - - // Common case: small offset, fits into instruction. - unsigned Mask = (1 << NumBits) - 1; - if (((Offset / Scale) & ~Mask) == 0) { - // Replace the FrameIndex with sp / fp - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Offset / Scale); - return; - } - - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned Bytes = (Offset > 0) ? Offset : -Offset; - unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale); - // MI would expand into a large number of instructions. Don't try to - // simplify the immediate. 
- if (NumMIs > 2) { - emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII, - *this, dl); - MBB.erase(II); - return; - } - - if (Offset > 0) { - // Translate r0 = add sp, imm to - // r0 = add sp, 255*4 - // r0 = add r0, (imm - 255*4) - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Mask); - Offset = (Offset - Mask * Scale); - MachineBasicBlock::iterator NII = next(II); - emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII, - *this, dl); - } else { - // Translate r0 = add sp, -imm to - // r0 = -imm (this is then translated into a series of instructons) - // r0 = add r0, sp - emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl); - MI.setDesc(TII.get(ARM::tADDhirr)); - MI.getOperand(i).ChangeToRegister(DestReg, false, false, true); - MI.getOperand(i+1).ChangeToRegister(FrameReg, false); - } - return; - } else { - unsigned ImmIdx = 0; - int InstrOffs = 0; - unsigned NumBits = 0; - unsigned Scale = 1; - switch (AddrMode) { - case ARMII::AddrModeT1_s: { - ImmIdx = i+1; - InstrOffs = MI.getOperand(ImmIdx).getImm(); - NumBits = (FrameReg == ARM::SP) ? 8 : 5; - Scale = 4; - break; - } - default: - assert(0 && "Unsupported addressing mode!"); - abort(); - break; - } - - Offset += InstrOffs * Scale; - assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); - - // Common case: small offset, fits into instruction. - MachineOperand &ImmOp = MI.getOperand(ImmIdx); - int ImmedOffset = Offset / Scale; - unsigned Mask = (1 << NumBits) - 1; - if ((unsigned)Offset <= Mask * Scale) { - // Replace the FrameIndex with sp - MI.getOperand(i).ChangeToRegister(FrameReg, false); - ImmOp.ChangeToImmediate(ImmedOffset); - return; - } - - bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill; - if (AddrMode == ARMII::AddrModeT1_s) { - // Thumb tLDRspi, tSTRspi. These will change to instructions that use - // a different base register. - NumBits = 5; - Mask = (1 << NumBits) - 1; - } - // If this is a thumb spill / restore, we will be using a constpool load to - // materialize the offset. - if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore) - ImmOp.ChangeToImmediate(0); - else { - // Otherwise, it didn't fit. Pull in what we can to simplify the immed. - ImmedOffset = ImmedOffset & Mask; - ImmOp.ChangeToImmediate(ImmedOffset); - Offset &= ~(Mask*Scale); - } - } - - // If we get here, the immediate doesn't fit into the instruction. We folded - // as much as possible above, handle the rest, providing a register that is - // SP+LargeImm. - assert(Offset && "This code isn't needed if offset already handled!"); - - if (Desc.mayLoad()) { - // Use the destination register to materialize sp + offset. - unsigned TmpReg = MI.getOperand(0).getReg(); - bool UseRR = false; - if (Opcode == ARM::tRestore) { - if (FrameReg == ARM::SP) - emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg, - Offset, false, TII, *this, dl); - else { - emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl); - UseRR = true; - } - } else - emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII, - *this, dl); - MI.setDesc(TII.get(ARM::tLDR)); - MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); - if (UseRR) - // Use [reg, reg] addrmode. - MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); - else // tLDR has an extra register operand. - MI.addOperand(MachineOperand::CreateReg(0, false)); - } else if (Desc.mayStore()) { - // FIXME! This is horrific!!! We need register scavenging. 
- // Our temporary workaround has marked r3 unavailable. Of course, r3 is - // also a ABI register so it's possible that is is the register that is - // being storing here. If that's the case, we do the following: - // r12 = r2 - // Use r2 to materialize sp + offset - // str r3, r2 - // r2 = r12 - unsigned ValReg = MI.getOperand(0).getReg(); - unsigned TmpReg = ARM::R3; - bool UseRR = false; - if (ValReg == ARM::R3) { - BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12) - .addReg(ARM::R2, RegState::Kill); - TmpReg = ARM::R2; - } - if (TmpReg == ARM::R3 && AFI->isR3LiveIn()) - BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12) - .addReg(ARM::R3, RegState::Kill); - if (Opcode == ARM::tSpill) { - if (FrameReg == ARM::SP) - emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg, - Offset, false, TII, *this, dl); - else { - emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl); - UseRR = true; - } - } else - emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII, - *this, dl); - MI.setDesc(TII.get(ARM::tSTR)); - MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); - if (UseRR) // Use [reg, reg] addrmode. - MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); - else // tSTR has an extra register operand. - MI.addOperand(MachineOperand::CreateReg(0, false)); - - MachineBasicBlock::iterator NII = next(II); - if (ValReg == ARM::R3) - BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2) - .addReg(ARM::R12, RegState::Kill); - if (TmpReg == ARM::R3 && AFI->isR3LiveIn()) - BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3) - .addReg(ARM::R12, RegState::Kill); - } else - assert(false && "Unexpected opcode!"); -} - -void Thumb2RegisterInfo::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - unsigned NumBytes = MFI->getStackSize(); - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = (MBBI != MBB.end() ? - MBBI->getDebugLoc() : DebugLoc::getUnknownLoc()); - - // Check if R3 is live in. It might have to be used as a scratch register. - for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(), - E = MF.getRegInfo().livein_end(); I != E; ++I) { - if (I->first == ARM::R3) { - AFI->setR3IsLiveIn(true); - break; - } - } - - // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. - NumBytes = (NumBytes + 3) & ~3; - MFI->setStackSize(NumBytes); - - // Determine the sizes of each callee-save spill areas and record which frame - // belongs to which callee-save spill areas. 
- unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; - int FramePtrSpillFI = 0; - - if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); - return; - } - - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - break; - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - if (STI.isTargetDarwin()) { - AFI->addGPRCalleeSavedArea2Frame(FI); - GPRCS2Size += 4; - } else { - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - } - break; - default: - AFI->addDPRCalleeSavedAreaFrame(FI); - DPRCSSize += 8; - } - } - - if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { - ++MBBI; - if (MBBI != MBB.end()) - dl = MBBI->getDebugLoc(); - } - - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - if (STI.isTargetDarwin() || hasFP(MF)) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); - } - - // Determine starting offsets of spill areas. - unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); - unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; - unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); - AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); - AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); - AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); - - NumBytes = DPRCSOffset; - if (NumBytes) { - // Insert it after all the callee-save spills. - emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); - } - - if (STI.isTargetELF() && hasFP(MF)) { - MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - - AFI->getFramePtrSpillOffset()); - } - - AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); - AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); - AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0); } -static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) { - return (MI->getOpcode() == ARM::tRestore && - MI->getOperand(1).isFI() && - isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)); -} - -void Thumb2RegisterInfo::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - assert((MBBI->getOpcode() == ARM::tBX_RET || - MBBI->getOpcode() == ARM::tPOP_RET) && - "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); - int NumBytes = (int)MFI->getStackSize(); - - if (!AFI->hasStackFrame()) { - if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes); - } else { - // Unwind MBBI to point to first LDR / FLDD. 
- const unsigned *CSRegs = getCalleeSavedRegs(); - if (MBBI != MBB.begin()) { - do - --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); - if (!isCSRestore(MBBI, CSRegs)) - ++MBBI; - } - - // Move SP to start of FP callee save spill area. - NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + - AFI->getGPRCalleeSavedArea2Size() + - AFI->getDPRCalleeSavedAreaSize()); - - if (hasFP(MF)) { - NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; - // Reset SP based on frame pointer only if the stack frame extends beyond - // frame pointer stack slot or target is ELF and the function has FP. - if (NumBytes) - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes, - TII, *this, dl); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP) - .addReg(FramePtr); - } else { - if (MBBI->getOpcode() == ARM::tBX_RET && - &MBB.front() != MBBI && - prior(MBBI)->getOpcode() == ARM::tPOP) { - MachineBasicBlock::iterator PMBBI = prior(MBBI); - emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes); - } else - emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes); - } - } - - if (VARegSaveSize) { - // Epilogue for vararg functions: pop LR to R3 and branch off it. - // FIXME: Verify this is still ok when R3 is no longer being reserved. - BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3); - - emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize); - - BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3); - MBB.erase(MBBI); - } +bool Thumb2RegisterInfo:: +requiresRegisterScavenging(const MachineFunction &MF) const { + return true; } diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h index d379c31..a63c60b 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.h +++ b/lib/Target/ARM/Thumb2RegisterInfo.h @@ -20,40 +20,23 @@ namespace llvm { class ARMSubtarget; - class TargetInstrInfo; + class ARMBaseInstrInfo; class Type; struct Thumb2RegisterInfo : public ARMBaseRegisterInfo { public: - Thumb2RegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI); + Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. void emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned DestReg, int Val, - const TargetInstrInfo *TII, - DebugLoc dl) const; - - /// Code Generation virtual methods... 
- const TargetRegisterClass * - getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const; - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) const; bool requiresRegisterScavenging(const MachineFunction &MF) const; - - bool hasReservedCallFrame(MachineFunction &MF) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; - - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; }; } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp new file mode 100644 index 0000000..b8879d2 --- /dev/null +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -0,0 +1,685 @@ +//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "t2-reduce-size" +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMBaseInstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); +STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); +STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones"); + +static cl::opt<int> ReduceLimit("t2-reduce-limit", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3", + cl::init(-1), cl::Hidden); + +namespace { + /// ReduceTable - A static table with information on mapping from wide + /// opcodes to narrow + struct ReduceEntry { + unsigned WideOpc; // Wide opcode + unsigned NarrowOpc1; // Narrow opcode to transform to + unsigned NarrowOpc2; // Narrow opcode when it's two-address + uint8_t Imm1Limit; // Limit of immediate field (bits) + uint8_t Imm2Limit; // Limit of immediate field when it's two-address + unsigned LowRegs1 : 1; // Only possible if low-registers are used + unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr) + unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa. + // 1 - No cc field. + // 2 - Always set CPSR. + unsigned PredCC2 : 2; + unsigned Special : 1; // Needs to be dealt with specially + }; + + static const ReduceEntry ReduceTable[] = { + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 }, + // Note: immediate scale is 4. 
+ { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0 }, + { ARM::t2CMPzri,ARM::tCMPzi8, 0, 8, 0, 1, 0, 2,0, 0 }, + { ARM::t2CMPzrr,ARM::tCMPzhir,0, 0, 0, 0, 0, 2,0, 0 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 }, + // FIXME: adr.n immediate offset must be multiple of 4. + //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, + // FIXME: Do we need the 16-bit 'S' variant? + { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 }, + { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 }, + { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 0, 0,1, 0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 }, + { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 }, + { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + + // FIXME: Clean this up after splitting each Thumb load / store opcode + // into multiple ones. 
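+ // (Editorial reading guide, not in the original commit: in the t2LDRi12
+ // row below, NarrowOpc1 = tLDR, Imm1Limit = 5, LowRegs1 = 1 and
+ // Special = 1 mean a 32-bit "ldr.w rd, [rn, #imm]" may shrink to the
+ // 16-bit tLDR only via ReduceLoadStore, which checks that rd and rn are
+ // low registers and that the offset is word-aligned and at most
+ // ((1 << 5) - 1) * 4 = 124.)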
+ { ARM::t2LDRi12,ARM::tLDR, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRs, ARM::tLDR, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRBi12,ARM::tLDRB, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRBs, ARM::tLDRB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRHi12,ARM::tLDRH, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRHs, ARM::tLDRH, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRi12,ARM::tSTR, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRs, ARM::tSTR, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRBi12,ARM::tSTRB, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRBs, ARM::tSTRB, 0, 0, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRHi12,ARM::tSTRH, 0, 5, 0, 1, 0, 0,0, 1 }, + { ARM::t2STRHs, ARM::tSTRH, 0, 0, 0, 1, 0, 0,0, 1 }, + + { ARM::t2LDM_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDM, ARM::tLDM, ARM::tPOP, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STM, ARM::tSTM, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, + }; + + class VISIBILITY_HIDDEN Thumb2SizeReduce : public MachineFunctionPass { + public: + static char ID; + Thumb2SizeReduce(); + + const Thumb2InstrInfo *TII; + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "Thumb2 instruction size reduction pass"; + } + + private: + /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. + DenseMap<unsigned, unsigned> ReduceOpcodeMap; + + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead); + + bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry); + + bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, bool LiveCPSR); + + /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address + /// instruction. + bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR); + + /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit + /// non-two-address instruction. + bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR); + + /// ReduceMBB - Reduce width of instructions in the specified basic block. + bool ReduceMBB(MachineBasicBlock &MBB); + }; + char Thumb2SizeReduce::ID = 0; +} + +Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(&ID) { + for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { + unsigned FromOpc = ReduceTable[i].WideOpc; + if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) + assert(false && "Duplicated entries?"); + } +} + +static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) { + for (const unsigned *Regs = TID.ImplicitDefs; *Regs; ++Regs) + if (*Regs == ARM::CPSR) + return true; + return false; +} + +bool +Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead) { + if ((is2Addr && Entry.PredCC2 == 0) || + (!is2Addr && Entry.PredCC1 == 0)) { + if (Pred == ARMCC::AL) { + // Not predicated, must set CPSR. + if (!HasCC) { + // Original instruction was not setting CPSR, but CPSR is not + // currently live anyway. It's ok to set it. The CPSR def is + // dead though. + if (!LiveCPSR) { + HasCC = true; + CCDead = true; + return true; + } + return false; + } + } else { + // Predicated, must not set CPSR. 
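+ // (Editorial note, not in the original commit: for a PredCC entry of 0,
+ // e.g. t2ADDri -> tADDi8, the 16-bit encoding sets the flags exactly
+ // when it sits outside an IT block; hence an unpredicated candidate may
+ // gain a CPSR def only if CPSR is dead here, while a predicated one must
+ // not carry a CPSR def at all, which is the pair of checks around this
+ // point.)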
+      if (HasCC)
+        return false;
+    }
+  } else if ((is2Addr  && Entry.PredCC2 == 2) ||
+             (!is2Addr && Entry.PredCC1 == 2)) {
+    // The old opcode has an optional def of CPSR.
+    if (HasCC)
+      return true;
+    // If the old opcode does not have an implicit CPSR def either, it's not
+    // safe to narrow: the new opcode's CPSR def is not meant to be thrown
+    // away, e.g. CMP.
+    if (!HasImplicitCPSRDef(MI->getDesc()))
+      return false;
+    HasCC = true;
+  } else {
+    // The 16-bit instruction does not set CPSR.
+    if (HasCC)
+      return false;
+  }
+
+  return true;
+}
+
+static bool VerifyLowRegs(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  bool isPCOk = (Opc == ARM::t2LDM_RET) || (Opc == ARM::t2LDM);
+  bool isLROk = (Opc == ARM::t2STM);
+  bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi);
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    if (isPCOk && Reg == ARM::PC)
+      continue;
+    if (isLROk && Reg == ARM::LR)
+      continue;
+    if (isSPOk && Reg == ARM::SP)
+      continue;
+    if (!isARMLowRegister(Reg))
+      return false;
+  }
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const ReduceEntry &Entry) {
+  if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+    return false;
+
+  unsigned Scale = 1;
+  bool HasImmOffset = false;
+  bool HasShift = false;
+  bool isLdStMul = false;
+  unsigned Opc = Entry.NarrowOpc1;
+  unsigned OpNum = 3; // First of the 'rest' of the operands.
+  switch (Entry.WideOpc) {
+  default:
+    llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+  case ARM::t2LDRi12:
+  case ARM::t2STRi12:
+    Scale = 4;
+    HasImmOffset = true;
+    break;
+  case ARM::t2LDRBi12:
+  case ARM::t2STRBi12:
+    HasImmOffset = true;
+    break;
+  case ARM::t2LDRHi12:
+  case ARM::t2STRHi12:
+    Scale = 2;
+    HasImmOffset = true;
+    break;
+  case ARM::t2LDRs:
+  case ARM::t2LDRBs:
+  case ARM::t2LDRHs:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRSHs:
+  case ARM::t2STRs:
+  case ARM::t2STRBs:
+  case ARM::t2STRHs:
+    HasShift = true;
+    OpNum = 4;
+    break;
+  case ARM::t2LDM_RET:
+  case ARM::t2LDM:
+  case ARM::t2STM: {
+    OpNum = 0;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    unsigned Mode = MI->getOperand(1).getImm();
+    if (BaseReg == ARM::SP && ARM_AM::getAM4WBFlag(Mode)) {
+      Opc = Entry.NarrowOpc2;
+      OpNum = 2;
+    } else if (Entry.WideOpc == ARM::t2LDM_RET ||
+               !isARMLowRegister(BaseReg) ||
+               !ARM_AM::getAM4WBFlag(Mode) ||
+               ARM_AM::getAM4SubMode(Mode) != ARM_AM::ia) {
+      return false;
+    }
+    isLdStMul = true;
+    break;
+  }
+  }
+
+  unsigned OffsetReg = 0;
+  bool OffsetKill = false;
+  if (HasShift) {
+    OffsetReg  = MI->getOperand(2).getReg();
+    OffsetKill = MI->getOperand(2).isKill();
+    if (MI->getOperand(3).getImm())
+      // The Thumb1 addressing mode doesn't support shift.
+      return false;
+  }
+
+  unsigned OffsetImm = 0;
+  if (HasImmOffset) {
+    OffsetImm = MI->getOperand(2).getImm();
+    unsigned MaxOffset = ((1 << Entry.Imm1Limit) - 1) * Scale;
+    if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset)
+      // Make sure the immediate field fits.
+      return false;
+  }
+
+  // Add the 16-bit load / store instruction.
+  // FIXME: The Thumb1 addressing mode encodes both an immediate and a
+  // register offset.
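+  // For example, a t2LDRi12 with a byte offset of 8 becomes a tLDR whose
+  // immediate operand is 8 / Scale == 2: Scale is 4 for word-sized accesses
+  // and the check above guarantees the offset is a multiple of Scale.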
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc));
+  if (!isLdStMul) {
+    MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1));
+    if (Entry.NarrowOpc1 != ARM::tLDRSB && Entry.NarrowOpc1 != ARM::tLDRSH) {
+      // tLDRSB and tLDRSH do not have an immediate offset field; they take
+      // a register offset instead.
+      // FIXME: Remove this special case.
+      MIB.addImm(OffsetImm/Scale);
+    }
+    assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+    MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+  }
+
+  // Transfer the rest of the operands.
+  for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+  MBB.erase(MI);
+  ++NumLdSts;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR) {
+  if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+    return false;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (TID.mayLoad() || TID.mayStore())
+    return ReduceLoadStore(MBB, MI, Entry);
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default: break;
+  case ARM::t2ADDSri:
+  case ARM::t2ADDSrr: {
+    unsigned PredReg = 0;
+    if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+      switch (Opc) {
+      default: break;
+      case ARM::t2ADDSri: {
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR))
+          return true;
+        // fallthrough
+      }
+      case ARM::t2ADDSrr:
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+      }
+    }
+    break;
+  }
+  case ARM::t2RSBri:
+  case ARM::t2RSBSri:
+    if (MI->getOperand(2).getImm() == 0)
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+    break;
+  }
+  return false;
+}
+
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR) {
+
+  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+    return false;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  if (Reg0 != Reg1)
+    return false;
+  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+    return false;
+  if (Entry.Imm2Limit) {
+    unsigned Imm = MI->getOperand(2).getImm();
+    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+    if (Imm > Limit)
+      return false;
+  } else {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+      return false;
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc2);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewTID.isPredicable())
+      // Can't transfer the predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewTID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  if (TID.hasOptionalDef()) {
+    unsigned NumOps = TID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Add the 16-bit instruction.
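+  // For example, "t2ANDrr r0, r0, r1" can become the two-address
+  // "tAND r0, r1"; the Reg0 != Reg1 check above guarantees that the
+  // destination and the first source already share a register.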
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewTID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of the operands.
+  unsigned NumOps = TID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+      continue;
+    if (SkipPred && TID.OpInfo[i].isPredicate())
+      continue;
+    MIB.addOperand(MI->getOperand(i));
+  }
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+  MBB.erase(MI);
+  ++Num2Addrs;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                                 const ReduceEntry &Entry,
+                                 bool LiveCPSR) {
+  if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+    return false;
+
+  unsigned Limit = ~0U;
+  unsigned Scale = (Entry.WideOpc == ARM::t2ADDrSPi) ? 4 : 1;
+  if (Entry.Imm1Limit)
+    Limit = ((1 << Entry.Imm1Limit) - 1) * Scale;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+    if (TID.OpInfo[i].isPredicate())
+      continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      unsigned Reg = MO.getReg();
+      if (!Reg || Reg == ARM::CPSR)
+        continue;
+      if (Entry.WideOpc == ARM::t2ADDrSPi && Reg == ARM::SP)
+        continue;
+      if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+        return false;
+    } else if (MO.isImm() &&
+               !TID.OpInfo[i].isPredicate()) {
+      if (((unsigned)MO.getImm()) > Limit || (MO.getImm() & (Scale-1)) != 0)
+        return false;
+    }
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc1);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewTID.isPredicable())
+      // Can't transfer the predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewTID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  if (TID.hasOptionalDef()) {
+    unsigned NumOps = TID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewTID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of the operands.
+  unsigned NumOps = TID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+      continue;
+    if ((TID.getOpcode() == ARM::t2RSBSri ||
+         TID.getOpcode() == ARM::t2RSBri) && i == 2)
+      // Skip the zero immediate operand; it's now implicit.
+      continue;
+    bool isPred = (i < NumOps && TID.OpInfo[i].isPredicate());
+    if (SkipPred && isPred)
+      continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (Scale > 1 && !isPred && MO.isImm())
+      MIB.addImm(MO.getImm() / Scale);
+    else {
+      if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+        // Skip the implicit def of CPSR. Either it's modeled as an optional
+        // def now or it's already an implicit def on the new instruction.
+        continue;
+      MIB.addOperand(MO);
+    }
+  }
+  if (!TID.isPredicable() && NewTID.isPredicable())
+    AddDefaultPred(MIB);
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+  MBB.erase(MI);
+  ++NumNarrows;
+  return true;
+}
+
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
+  bool HasDef = false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    if (!MO.isDead())
+      HasDef = true;
+  }
+
+  return HasDef || LiveCPSR;
+}
+
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+    if (MO.isKill()) {
+      LiveCPSR = false;
+      break;
+    }
+  }
+
+  return LiveCPSR;
+}
+
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  bool LiveCPSR = false;
+  // Yes, CPSR could be live-in.
+  for (MachineBasicBlock::const_livein_iterator I = MBB.livein_begin(),
+       E = MBB.livein_end(); I != E; ++I) {
+    if (*I == ARM::CPSR) {
+      LiveCPSR = true;
+      break;
+    }
+  }
+
+  MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMII;
+  for (; MII != E; MII = NextMII) {
+    NextMII = next(MII);
+
+    MachineInstr *MI = &*MII;
+    LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+    unsigned Opcode = MI->getOpcode();
+    DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+    if (OPI != ReduceOpcodeMap.end()) {
+      const ReduceEntry &Entry = ReduceTable[OPI->second];
+      // Handle the "special" cases first.
+      if (Entry.Special) {
+        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) {
+          Modified = true;
+          MachineBasicBlock::iterator I = prior(NextMII);
+          MI = &*I;
+        }
+        goto ProcessNext;
+      }
+
+      // Try to transform to a 16-bit two-address instruction.
+      if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) {
+        Modified = true;
+        MachineBasicBlock::iterator I = prior(NextMII);
+        MI = &*I;
+        goto ProcessNext;
+      }
+
+      // Try to transform to a 16-bit non-two-address instruction.
+      if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
+        Modified = true;
+        MachineBasicBlock::iterator I = prior(NextMII);
+        MI = &*I;
+      }
+    }
+
+  ProcessNext:
+    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR);
+  }
+
+  return Modified;
+}
+
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+
+  bool Modified = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    Modified |= ReduceMBB(*I);
+  return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass.
+FunctionPass *llvm::createThumb2SizeReductionPass() {
+  return new Thumb2SizeReduce();
+}
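The factory function above is this file's only public entry point; the call site that schedules the pass lives in the target machine, outside this file. A minimal sketch of what such a call site could look like, assuming ARMBaseTargetMachine's addPreEmitPass hook and Subtarget member (both assumptions for illustration, not taken from this diff):

    // Sketch only: form IT blocks first, then shrink eligible 32-bit
    // Thumb2 encodings to their 16-bit forms before code emission.
    bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
                                              CodeGenOpt::Level OptLevel) {
      if (Subtarget.isThumb2()) {
        PM.add(createThumb2ITBlockPass());
        PM.add(createThumb2SizeReductionPass());
      }
      return true;
    }

Running the size reduction after IT-block formation matters: the pass narrows an instruction only when the flag-setting and predication rules checked in VerifyPredAndCC still hold, and those depend on the final IT-block placement.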